//===== Copyright 1996-2005, Valve Corporation, All rights reserved. ======//
//
// Purpose: - defines SIMD "structure of arrays" classes and functions.
//
//===========================================================================//
# ifndef SSEMATH_H
# define SSEMATH_H
# if defined( _X360 )
# include <xboxmath.h>
# elif defined ( _PS3 )
# include <vectormath/c/vectormath_aos.h>
# include <vectormath/c/vectormath_aos_v.h>
# else
# include <xmmintrin.h>
# ifndef _LINUX
# include <emmintrin.h>
# endif
# endif
# ifndef SPU
# include "mathlib/vector.h"
# include "mathlib/mathlib.h"
# else
# include "mathlib/math_pfns.h"
# endif
# include "mathlib/fltx4.h"
// The FLTX4 type is a fltx4 used as a parameter to a function.
// On the 360, the best way to do this is pass-by-copy on the registers.
// On the PC, the best way is to pass by const reference.
// The compiler will sometimes, but not always, replace a pass-by-const-ref
// with a pass-in-reg on the 360; to avoid this confusion, you can
// explicitly use a FLTX4 as the parameter type.
# ifdef _X360
typedef __vector4 FLTX4 ;
# elif defined( _PS3 )
typedef vec_float4 FLTX4 ;
# else
typedef const fltx4 & FLTX4 ;
# endif
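// Illustrative usage sketch (not part of the original header): a hypothetical
// helper written against the FLTX4 parameter type, so the argument is passed
// in registers on the consoles and by const reference on the PC:
//
//   FORCEINLINE fltx4 ScaleByHalf( FLTX4 v )     // hypothetical example function
//   {
//       return MulSIMD( v, Four_PointFives );    // both declared later in this header
//   }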
// A 16-byte aligned int32 data structure
// (for use when writing out fltx4's as SIGNED
// ints).
struct ALIGN16 intx4
{
int32 m_i32 [ 4 ] ;
inline int & operator [ ] ( int which )
{
return m_i32 [ which ] ;
}
inline const int & operator [ ] ( int which ) const
{
return m_i32 [ which ] ;
}
inline int32 * Base ( ) {
return m_i32 ;
}
inline const int32 * Base ( ) const
{
return m_i32 ;
}
inline bool operator==( const intx4 & other ) const
{
return m_i32 [ 0 ] == other . m_i32 [ 0 ] &&
m_i32 [ 1 ] == other . m_i32 [ 1 ] &&
m_i32 [ 2 ] == other . m_i32 [ 2 ] &&
m_i32 [ 3 ] == other . m_i32 [ 3 ] ;
}
} ALIGN16_POST ;
# if defined( _DEBUG ) && defined( _X360 )
FORCEINLINE void TestVPUFlags ( )
{
// Check that the VPU is in the appropriate (Java-compliant) mode (see 3.2.1 in altivec_pem.pdf on xds.xbox.com)
__vector4 a ;
__asm
{
mfvscr a ;
}
unsigned int * flags = ( unsigned int * ) & a ;
unsigned int controlWord = flags [ 3 ] ;
Assert ( controlWord == 0 ) ;
}
# else // _DEBUG
FORCEINLINE void TestVPUFlags ( ) { }
# endif // _DEBUG
// useful constants in SIMD packed float format:
// (note: some of these aren't stored on the 360,
// but are manufactured directly in one or two
// instructions, saving a load and possible L2
// miss.)
# ifdef _X360
// Shouldn't the PS3 have something similar?
# define Four_Zeros XMVectorZero() // 0 0 0 0
# define Four_Ones XMVectorSplatOne() // 1 1 1 1
extern const fltx4 Four_Twos ; // 2 2 2 2
extern const fltx4 Four_Threes ; // 3 3 3 3
extern const fltx4 Four_Fours ; // guess.
extern const fltx4 Four_Point225s ; // .225 .225 .225 .225
extern const fltx4 Four_PointFives ; // .5 .5 .5 .5
extern const fltx4 Four_Thirds ; // 1/3
extern const fltx4 Four_TwoThirds ; // 2/3
extern const fltx4 Four_NegativeOnes ; // -1 -1 -1 -1
extern const fltx4 Four_DegToRad ; // (float)(M_PI_F / 180.f) times four
# elif defined(SPU)
# define Four_Zeros spu_splats( 0.0f ) // 0 0 0 0
# define Four_Ones spu_splats( 1.0f ) // 1 1 1 1
# define Four_Twos spu_splats( 2.0f ) // 2 2 2 2
# define Four_Threes spu_splats( 3.0f ) // 3 3 3 3
# define Four_Fours spu_splats( 4.0f ) // guess.
# define Four_Point225s spu_splats( 0.225f ) // .225 .225 .225 .225
# define Four_PointFives spu_splats( 0.5f ) // .5 .5 .5 .5
# define Four_Thirds spu_splats( 0.33333333f ) // 1/3
# define Four_TwoThirds spu_splats( 0.66666666f ) // 2/3
# define Four_NegativeOnes spu_splats( -1.0f ) // -1 -1 -1 -1
# define Four_DegToRad spu_splats((float)(M_PI_F / 180.f))
# else
extern const fltx4 Four_Zeros ; // 0 0 0 0
extern const fltx4 Four_Ones ; // 1 1 1 1
extern const fltx4 Four_Twos ; // 2 2 2 2
extern const fltx4 Four_Threes ; // 3 3 3 3
extern const fltx4 Four_Fours ; // guess.
extern const fltx4 Four_Point225s ; // .225 .225 .225 .225
extern const fltx4 Four_PointFives ; // .5 .5 .5 .5
extern const fltx4 Four_Thirds ; // 1/3
extern const fltx4 Four_TwoThirds ; // 2/3
extern const fltx4 Four_NegativeOnes ; // -1 -1 -1 -1
extern const fltx4 Four_DegToRad ; // (float)(M_PI_F / 180.f) times four
# endif
extern const fltx4 Four_Epsilons ; // FLT_EPSILON FLT_EPSILON FLT_EPSILON FLT_EPSILON
extern const fltx4 Four_2ToThe21s ; // (1<<21)..
extern const fltx4 Four_2ToThe22s ; // (1<<22)..
extern const fltx4 Four_2ToThe23s ; // (1<<23)..
extern const fltx4 Four_2ToThe24s ; // (1<<24)..
extern const fltx4 Four_Origin ; // 0 0 0 1 (origin point, like vr0 on the PS2)
extern const fltx4 Four_FLT_MAX ; // FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX
extern const fltx4 Four_Negative_FLT_MAX ; // -FLT_MAX, -FLT_MAX, -FLT_MAX, -FLT_MAX
extern const fltx4 g_SIMD_0123 ; // 0 1 2 3 as float
// coefficients for polynomial approximation of srgb conversions
// 4th order polynomial for x^(1/2.2), x in 0..1
extern const fltx4 Four_LinearToGammaCoefficients_A ; // *x^4
extern const fltx4 Four_LinearToGammaCoefficients_B ; // *x^3
extern const fltx4 Four_LinearToGammaCoefficients_C ; // *x^2
extern const fltx4 Four_LinearToGammaCoefficients_D ; // *x^1
extern const fltx4 Four_LinearToGammaCoefficients_E ; // *x^0
// 3rd order polynomial for x^2.2 x in 0..1
extern const fltx4 Four_GammaToLinearCoefficients_A ; // *x^3
extern const fltx4 Four_GammaToLinearCoefficients_B ; // *x^2
extern const fltx4 Four_GammaToLinearCoefficients_C ; // *x^1
extern const fltx4 Four_GammaToLinearCoefficients_D ; // *x^0
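// Illustrative sketch (not part of the original header): one hedged way to
// evaluate the 4th-order linear-to-gamma polynomial above is Horner's method
// with MaddSIMD (declared below), assuming x has already been clamped to [0,1]:
//
//   fltx4 LinearToGammaApproxSIMD( const fltx4 &x )              // hypothetical helper
//   {
//       fltx4 r = Four_LinearToGammaCoefficients_A;              // A
//       r = MaddSIMD( r, x, Four_LinearToGammaCoefficients_B );  // A*x + B
//       r = MaddSIMD( r, x, Four_LinearToGammaCoefficients_C );  // (A*x + B)*x + C
//       r = MaddSIMD( r, x, Four_LinearToGammaCoefficients_D );  // ...*x + D
//       return MaddSIMD( r, x, Four_LinearToGammaCoefficients_E ); // ...*x + E
//   }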
// external aligned integer constants
# ifndef ALIGN16_POST
# define ALIGN16_POST
# endif
extern const fltx4 g_SIMD_Identity [ 4 ] ; // [1 0 0 0], [0 1 0 0], [0 0 1 0], [0 0 0 1]
extern const ALIGN16 uint32 g_SIMD_clear_signmask [ ] ALIGN16_POST ; // 0x7fffffff x 4
extern const ALIGN16 uint32 g_SIMD_signmask [ ] ALIGN16_POST ; // 0x80000000 x 4
extern const ALIGN16 uint32 g_SIMD_lsbmask [ ] ALIGN16_POST ; // 0xfffffffe x 4
extern const ALIGN16 uint32 g_SIMD_clear_wmask [ ] ALIGN16_POST ; // -1 -1 -1 0
extern const ALIGN16 uint32 g_SIMD_ComponentMask [ 4 ] [ 4 ] ALIGN16_POST ; // [0xFFFFFFFF 0 0 0], [0 0xFFFFFFFF 0 0], [0 0 0xFFFFFFFF 0], [0 0 0 0xFFFFFFFF]
extern const ALIGN16 uint32 g_SIMD_AllOnesMask [ ] ALIGN16_POST ; // ~0,~0,~0,~0
extern const ALIGN16 uint32 g_SIMD_Low16BitsMask [ ] ALIGN16_POST ; // 0xffff x 4
// this mask is used for skipping the tail of things. If you have N elements in an array, and wish
// to mask out the tail, g_SIMD_SkipTailMask[N & 3] is what you want to use for the last iteration
// (see the usage sketch below).
extern const uint32 ALIGN16 g_SIMD_SkipTailMask [ 4 ] [ 4 ] ALIGN16_POST ;
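// Illustrative usage sketch (not part of the original header): summing an
// array whose length is not a multiple of 4, assuming pData is a 16-byte
// aligned float array padded out to a multiple of 16 bytes and nCount > 0:
//
//   fltx4 sum = Four_Zeros;
//   for ( int i = 0; i < nCount; i += 4 )
//   {
//       fltx4 v = LoadAlignedSIMD( pData + i );
//       if ( i + 4 > nCount ) // last iteration: zero out the lanes past the end
//           v = AndSIMD( v, LoadAlignedSIMD( g_SIMD_SkipTailMask[ nCount & 3 ] ) );
//       sum = AddSIMD( sum, v );
//   }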
extern const uint32 ALIGN16 g_SIMD_EveryOtherMask [ ] ; // 0, ~0, 0, ~0
// Define prefetch macros.
// The characteristics of cache and prefetch are completely
// different between the different platforms, so you DO NOT
// want to just define one macro that maps to every platform
// intrinsic under the hood -- you need to prefetch at different
// intervals between x86 and PPC, for example, and that is
// a higher level code change.
// On the other hand, I'm tired of typing #ifdef _X360
// all over the place, so this is just a nop on Intel, PS3.
# ifdef PLATFORM_PPC
# if defined(_X360)
# define PREFETCH360(address, offset) __dcbt(offset,address)
# elif defined(_PS3)
# define PREFETCH360(address, offset) __dcbt( reinterpret_cast< const char * >(address) + offset )
# else
# error Prefetch not defined for this platform!
# endif
# else
# define PREFETCH360(x,y) // nothing
# endif
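// Illustrative usage sketch (not part of the original header): prefetch some
// distance ahead of the element being processed while walking a large array
// on the PPC consoles (the macro compiles away on other platforms):
//
//   for ( int i = 0; i < nCount; ++i )
//   {
//       PREFETCH360( pArray, ( i + 4 ) * sizeof( pArray[0] ) ); // stride of 4 is a guess; tune per platform
//       ProcessElement( pArray[ i ] );                          // pArray, nCount, ProcessElement are hypothetical
//   }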
// Here's a handy function to align a pointer to the next
// sizeof(T)-byte boundary (sixteen bytes for fltx4) -- it'll round it up
// to the nearest multiple of the alignment. This is useful if you're subdividing
// big swaths of allocated memory, but in that case, remember
// to leave yourself the necessary slack in the allocation.
template < class T >
inline T * AlignPointer ( void * ptr )
{
# if defined( __clang__ )
uintp temp = ( uintp ) ptr ;
# else
unsigned temp = ( unsigned ) ptr ;
# endif
temp = ALIGN_VALUE ( temp , sizeof ( T ) ) ;
return ( T * ) temp ;
}
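// Illustrative usage sketch (not part of the original header): carving an
// aligned fltx4 out of a raw allocation, over-allocating by the alignment so
// the rounded-up pointer still has room:
//
//   void *pRaw = malloc( nBytes + sizeof( fltx4 ) ); // leave slack for the round-up; allocator choice is an assumption
//   fltx4 *pAligned = AlignPointer< fltx4 >( pRaw );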
# ifdef _PS3
// Note that similar defines exist in math_pfns.h
// Maybe we should consolidate in one place for all platforms.
# define _VEC_CLEAR_SIGNMASK (__vector unsigned int) {0x7fffffff,0x7fffffff,0x7fffffff,0x7fffffff}
# define _VEC_SIGNMASK (__vector unsigned int) { 0x80000000, 0x80000000, 0x80000000, 0x80000000 }
# define _VEC_LSBMASK (__vector unsigned int) { 0xfffffffe, 0xfffffffe, 0xfffffffe, 0xfffffffe }
# define _VEC_CLEAR_WMASK (__vector unsigned int) {0xffffffff, 0xffffffff, 0xffffffff, 0}
# define _VEC_COMPONENT_MASK_0 (__vector unsigned int) {0xffffffff, 0, 0, 0}
# define _VEC_COMPONENT_MASK_1 (__vector unsigned int) {0, 0xffffffff, 0, 0}
# define _VEC_COMPONENT_MASK_2 (__vector unsigned int) {0, 0, 0xffffffff, 0}
# define _VEC_COMPONENT_MASK_3 (__vector unsigned int) {0, 0, 0, 0xffffffff}
# define _VEC_SWIZZLE_WZYX (__vector unsigned char) { 0x0c,0x0d,0x0e,0x0f, 0x08,0x09,0x0a,0x0b, 0x04,0x05,0x06,0x07, 0x00,0x01,0x02,0x03 }
# define _VEC_SWIZZLE_ZWXY (__vector unsigned char) { 0x08,0x09,0x0a,0x0b, 0x0c,0x0d,0x0e,0x0f, 0x00,0x01,0x02,0x03, 0x04,0x05,0x06,0x07 }
# define _VEC_SWIZZLE_YXWZ (__vector unsigned char) { 0x04,0x05,0x06,0x07, 0x00,0x01,0x02,0x03, 0x0c,0x0d,0x0e,0x0f, 0x08,0x09,0x0a,0x0b }
# define _VEC_ZERO (__vector unsigned int) {0,0,0,0}
# define _VEC_FLTMAX (__vector float) {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}
# define _VEC_FLTMIN (__vector float) {FLT_MIN,FLT_MIN,FLT_MIN,FLT_MIN}
# define _VEC_ORIGIN (__vector unsigned int) { 0x00000000, 0x00000000, 0x00000000, 0xffffffff }
# endif
# if USE_STDC_FOR_SIMD
//---------------------------------------------------------------------
// Standard C (fallback/Linux) implementation (only there for compat - slow)
//---------------------------------------------------------------------
FORCEINLINE float SubFloat ( const fltx4 & a , int idx )
{
return a . m128_f32 [ idx ] ;
}
FORCEINLINE float & SubFloat ( fltx4 & a , int idx )
{
return a . m128_f32 [ idx ] ;
}
FORCEINLINE uint32 SubInt ( const fltx4 & a , int idx )
{
return a . m128_u32 [ idx ] ;
}
FORCEINLINE uint32 & SubInt ( fltx4 & a , int idx )
{
return a . m128_u32 [ idx ] ;
}
// Return zero in the fastest way -- on the x360, faster even than loading.
FORCEINLINE fltx4 LoadZeroSIMD ( void )
{
return Four_Zeros ;
}
// Return one in the fastest way -- on the x360, faster even than loading.
FORCEINLINE fltx4 LoadOneSIMD ( void )
{
return Four_Ones ;
}
FORCEINLINE fltx4 SplatXSIMD ( const fltx4 & a )
{
fltx4 retVal ;
SubFloat ( retVal , 0 ) = SubFloat ( a , 0 ) ;
SubFloat ( retVal , 1 ) = SubFloat ( a , 0 ) ;
SubFloat ( retVal , 2 ) = SubFloat ( a , 0 ) ;
SubFloat ( retVal , 3 ) = SubFloat ( a , 0 ) ;
return retVal ;
}
FORCEINLINE fltx4 SplatYSIMD ( fltx4 a )
{
fltx4 retVal ;
SubFloat ( retVal , 0 ) = SubFloat ( a , 1 ) ;
SubFloat ( retVal , 1 ) = SubFloat ( a , 1 ) ;
SubFloat ( retVal , 2 ) = SubFloat ( a , 1 ) ;
SubFloat ( retVal , 3 ) = SubFloat ( a , 1 ) ;
return retVal ;
}
FORCEINLINE fltx4 SplatZSIMD ( fltx4 a )
{
fltx4 retVal ;
SubFloat ( retVal , 0 ) = SubFloat ( a , 2 ) ;
SubFloat ( retVal , 1 ) = SubFloat ( a , 2 ) ;
SubFloat ( retVal , 2 ) = SubFloat ( a , 2 ) ;
SubFloat ( retVal , 3 ) = SubFloat ( a , 2 ) ;
return retVal ;
}
FORCEINLINE fltx4 SplatWSIMD ( fltx4 a )
{
fltx4 retVal ;
SubFloat ( retVal , 0 ) = SubFloat ( a , 3 ) ;
SubFloat ( retVal , 1 ) = SubFloat ( a , 3 ) ;
SubFloat ( retVal , 2 ) = SubFloat ( a , 3 ) ;
SubFloat ( retVal , 3 ) = SubFloat ( a , 3 ) ;
return retVal ;
}
FORCEINLINE fltx4 SetXSIMD ( const fltx4 & a , const fltx4 & x )
{
fltx4 result = a ;
SubFloat ( result , 0 ) = SubFloat ( x , 0 ) ;
return result ;
}
FORCEINLINE fltx4 SetYSIMD ( const fltx4 & a , const fltx4 & y )
{
fltx4 result = a ;
SubFloat ( result , 1 ) = SubFloat ( y , 1 ) ;
return result ;
}
FORCEINLINE fltx4 SetZSIMD ( const fltx4 & a , const fltx4 & z )
{
fltx4 result = a ;
SubFloat ( result , 2 ) = SubFloat ( z , 2 ) ;
return result ;
}
FORCEINLINE fltx4 SetWSIMD ( const fltx4 & a , const fltx4 & w )
{
fltx4 result = a ;
SubFloat ( result , 3 ) = SubFloat ( w , 3 ) ;
return result ;
}
FORCEINLINE fltx4 SetComponentSIMD ( const fltx4 & a , int nComponent , float flValue )
{
fltx4 result = a ;
SubFloat ( result , nComponent ) = flValue ;
return result ;
}
// a b c d -> b c d a
FORCEINLINE fltx4 RotateLeft ( const fltx4 & a )
{
fltx4 retVal ;
SubFloat ( retVal , 0 ) = SubFloat ( a , 1 ) ;
SubFloat ( retVal , 1 ) = SubFloat ( a , 2 ) ;
SubFloat ( retVal , 2 ) = SubFloat ( a , 3 ) ;
SubFloat ( retVal , 3 ) = SubFloat ( a , 0 ) ;
return retVal ;
}
// a b c d -> c d a b
FORCEINLINE fltx4 RotateLeft2 ( const fltx4 & a )
{
fltx4 retVal ;
SubFloat ( retVal , 0 ) = SubFloat ( a , 2 ) ;
SubFloat ( retVal , 1 ) = SubFloat ( a , 3 ) ;
SubFloat ( retVal , 2 ) = SubFloat ( a , 0 ) ;
SubFloat ( retVal , 3 ) = SubFloat ( a , 1 ) ;
return retVal ;
}
# define BINOP(op) \
fltx4 retVal ; \
SubFloat ( retVal , 0 ) = ( SubFloat ( a , 0 ) op SubFloat ( b , 0 ) ) ; \
SubFloat ( retVal , 1 ) = ( SubFloat ( a , 1 ) op SubFloat ( b , 1 ) ) ; \
SubFloat ( retVal , 2 ) = ( SubFloat ( a , 2 ) op SubFloat ( b , 2 ) ) ; \
SubFloat ( retVal , 3 ) = ( SubFloat ( a , 3 ) op SubFloat ( b , 3 ) ) ; \
return retVal ;
# define IBINOP(op) \
fltx4 retVal ; \
SubInt ( retVal , 0 ) = ( SubInt ( a , 0 ) op SubInt ( b , 0 ) ) ; \
SubInt ( retVal , 1 ) = ( SubInt ( a , 1 ) op SubInt ( b , 1 ) ) ; \
SubInt ( retVal , 2 ) = ( SubInt ( a , 2 ) op SubInt ( b , 2 ) ) ; \
SubInt ( retVal , 3 ) = ( SubInt ( a , 3 ) op SubInt ( b , 3 ) ) ; \
return retVal ;
FORCEINLINE fltx4 AddSIMD ( const fltx4 & a , const fltx4 & b )
{
BINOP ( + ) ;
}
FORCEINLINE fltx4 SubSIMD ( const fltx4 & a , const fltx4 & b ) // a-b
{
BINOP ( - ) ;
} ;
FORCEINLINE fltx4 MulSIMD ( const fltx4 & a , const fltx4 & b ) // a*b
{
BINOP ( * ) ;
}
FORCEINLINE fltx4 DivSIMD ( const fltx4 & a , const fltx4 & b ) // a/b
{
BINOP ( / ) ;
}
FORCEINLINE fltx4 DivEstSIMD ( const fltx4 & a , const fltx4 & b ) // a/b
{
BINOP ( / ) ;
}
FORCEINLINE fltx4 MaddSIMD ( const fltx4 & a , const fltx4 & b , const fltx4 & c ) // a*b + c
{
return AddSIMD ( MulSIMD ( a , b ) , c ) ;
}
FORCEINLINE fltx4 MsubSIMD ( const fltx4 & a , const fltx4 & b , const fltx4 & c ) // c - a*b
{
return SubSIMD ( c , MulSIMD ( a , b ) ) ;
} ;
FORCEINLINE fltx4 SinSIMD ( const fltx4 & radians )
{
fltx4 result ;
SubFloat ( result , 0 ) = sin ( SubFloat ( radians , 0 ) ) ;
SubFloat ( result , 1 ) = sin ( SubFloat ( radians , 1 ) ) ;
SubFloat ( result , 2 ) = sin ( SubFloat ( radians , 2 ) ) ;
SubFloat ( result , 3 ) = sin ( SubFloat ( radians , 3 ) ) ;
return result ;
}
FORCEINLINE void SinCos3SIMD ( fltx4 & sine , fltx4 & cosine , const fltx4 & radians )
{
SinCos ( SubFloat ( radians , 0 ) , & SubFloat ( sine , 0 ) , & SubFloat ( cosine , 0 ) ) ;
SinCos ( SubFloat ( radians , 1 ) , & SubFloat ( sine , 1 ) , & SubFloat ( cosine , 1 ) ) ;
SinCos ( SubFloat ( radians , 2 ) , & SubFloat ( sine , 2 ) , & SubFloat ( cosine , 2 ) ) ;
}
FORCEINLINE void SinCosSIMD ( fltx4 & sine , fltx4 & cosine , const fltx4 & radians )
{
SinCos ( SubFloat ( radians , 0 ) , & SubFloat ( sine , 0 ) , & SubFloat ( cosine , 0 ) ) ;
SinCos ( SubFloat ( radians , 1 ) , & SubFloat ( sine , 1 ) , & SubFloat ( cosine , 1 ) ) ;
SinCos ( SubFloat ( radians , 2 ) , & SubFloat ( sine , 2 ) , & SubFloat ( cosine , 2 ) ) ;
SinCos ( SubFloat ( radians , 3 ) , & SubFloat ( sine , 3 ) , & SubFloat ( cosine , 3 ) ) ;
}
FORCEINLINE fltx4 ArcSinSIMD ( const fltx4 & sine )
{
fltx4 result ;
SubFloat ( result , 0 ) = asin ( SubFloat ( sine , 0 ) ) ;
SubFloat ( result , 1 ) = asin ( SubFloat ( sine , 1 ) ) ;
SubFloat ( result , 2 ) = asin ( SubFloat ( sine , 2 ) ) ;
SubFloat ( result , 3 ) = asin ( SubFloat ( sine , 3 ) ) ;
return result ;
}
FORCEINLINE fltx4 ArcCosSIMD ( const fltx4 & cs )
{
fltx4 result ;
SubFloat ( result , 0 ) = acos ( SubFloat ( cs , 0 ) ) ;
SubFloat ( result , 1 ) = acos ( SubFloat ( cs , 1 ) ) ;
SubFloat ( result , 2 ) = acos ( SubFloat ( cs , 2 ) ) ;
SubFloat ( result , 3 ) = acos ( SubFloat ( cs , 3 ) ) ;
return result ;
}
// tan^-1(a/b) .. ie, pass sin in as a and cos in as b
FORCEINLINE fltx4 ArcTan2SIMD ( const fltx4 & a , const fltx4 & b )
{
fltx4 result ;
SubFloat ( result , 0 ) = atan2 ( SubFloat ( a , 0 ) , SubFloat ( b , 0 ) ) ;
SubFloat ( result , 1 ) = atan2 ( SubFloat ( a , 1 ) , SubFloat ( b , 1 ) ) ;
SubFloat ( result , 2 ) = atan2 ( SubFloat ( a , 2 ) , SubFloat ( b , 2 ) ) ;
SubFloat ( result , 3 ) = atan2 ( SubFloat ( a , 3 ) , SubFloat ( b , 3 ) ) ;
return result ;
}
FORCEINLINE fltx4 MaxSIMD ( const fltx4 & a , const fltx4 & b ) // max(a,b)
{
fltx4 retVal ;
SubFloat ( retVal , 0 ) = max ( SubFloat ( a , 0 ) , SubFloat ( b , 0 ) ) ;
SubFloat ( retVal , 1 ) = max ( SubFloat ( a , 1 ) , SubFloat ( b , 1 ) ) ;
SubFloat ( retVal , 2 ) = max ( SubFloat ( a , 2 ) , SubFloat ( b , 2 ) ) ;
SubFloat ( retVal , 3 ) = max ( SubFloat ( a , 3 ) , SubFloat ( b , 3 ) ) ;
return retVal ;
}
FORCEINLINE fltx4 MinSIMD ( const fltx4 & a , const fltx4 & b ) // min(a,b)
{
fltx4 retVal ;
SubFloat ( retVal , 0 ) = min ( SubFloat ( a , 0 ) , SubFloat ( b , 0 ) ) ;
SubFloat ( retVal , 1 ) = min ( SubFloat ( a , 1 ) , SubFloat ( b , 1 ) ) ;
SubFloat ( retVal , 2 ) = min ( SubFloat ( a , 2 ) , SubFloat ( b , 2 ) ) ;
SubFloat ( retVal , 3 ) = min ( SubFloat ( a , 3 ) , SubFloat ( b , 3 ) ) ;
return retVal ;
}
FORCEINLINE fltx4 AndSIMD ( const fltx4 & a , const fltx4 & b ) // a & b
{
IBINOP ( & ) ;
}
FORCEINLINE fltx4 AndNotSIMD ( const fltx4 & a , const fltx4 & b ) // ~a & b
{
fltx4 retVal ;
SubInt ( retVal , 0 ) = ~ SubInt ( a , 0 ) & SubInt ( b , 0 ) ;
SubInt ( retVal , 1 ) = ~ SubInt ( a , 1 ) & SubInt ( b , 1 ) ;
SubInt ( retVal , 2 ) = ~ SubInt ( a , 2 ) & SubInt ( b , 2 ) ;
SubInt ( retVal , 3 ) = ~ SubInt ( a , 3 ) & SubInt ( b , 3 ) ;
return retVal ;
}
FORCEINLINE fltx4 XorSIMD ( const fltx4 & a , const fltx4 & b ) // a ^ b
{
IBINOP ( ^ ) ;
}
FORCEINLINE fltx4 OrSIMD ( const fltx4 & a , const fltx4 & b ) // a | b
{
IBINOP ( | ) ;
}
FORCEINLINE fltx4 NegSIMD ( const fltx4 & a ) // negate: -a
{
fltx4 retval ;
SubFloat ( retval , 0 ) = - SubFloat ( a , 0 ) ;
SubFloat ( retval , 1 ) = - SubFloat ( a , 1 ) ;
SubFloat ( retval , 2 ) = - SubFloat ( a , 2 ) ;
SubFloat ( retval , 3 ) = - SubFloat ( a , 3 ) ;
return retval ;
}
FORCEINLINE bool IsAllZeros ( const fltx4 & a ) // all floats of a zero?
{
return ( SubFloat ( a , 0 ) == 0.0 ) &&
( SubFloat ( a , 1 ) == 0.0 ) &&
( SubFloat ( a , 2 ) == 0.0 ) &&
( SubFloat ( a , 3 ) == 0.0 ) ;
}
// for branching when a.xyzw > b.xyzw
FORCEINLINE bool IsAllGreaterThan ( const fltx4 & a , const fltx4 & b )
{
return SubFloat ( a , 0 ) > SubFloat ( b , 0 ) &&
SubFloat ( a , 1 ) > SubFloat ( b , 1 ) &&
SubFloat ( a , 2 ) > SubFloat ( b , 2 ) &&
SubFloat ( a , 3 ) > SubFloat ( b , 3 ) ;
}
// for branching when a.xyzw >= b.xyzw
FORCEINLINE bool IsAllGreaterThanOrEq ( const fltx4 & a , const fltx4 & b )
{
return SubFloat ( a , 0 ) >= SubFloat ( b , 0 ) &&
SubFloat ( a , 1 ) >= SubFloat ( b , 1 ) &&
SubFloat ( a , 2 ) >= SubFloat ( b , 2 ) &&
SubFloat ( a , 3 ) >= SubFloat ( b , 3 ) ;
}
// For branching if all a.xyzw == b.xyzw
FORCEINLINE bool IsAllEqual ( const fltx4 & a , const fltx4 & b )
{
return SubFloat ( a , 0 ) == SubFloat ( b , 0 ) &&
SubFloat ( a , 1 ) == SubFloat ( b , 1 ) &&
SubFloat ( a , 2 ) == SubFloat ( b , 2 ) &&
SubFloat ( a , 3 ) == SubFloat ( b , 3 ) ;
}
// For branching if a.x == b.x || a.y == b.y || a.z == b.z || a.w == b.w
FORCEINLINE bool IsAnyEqual ( const fltx4 & a , const fltx4 & b )
{
return SubFloat ( a , 0 ) == SubFloat ( b , 0 ) ||
SubFloat ( a , 1 ) == SubFloat ( b , 1 ) ||
SubFloat ( a , 2 ) == SubFloat ( b , 2 ) ||
SubFloat ( a , 3 ) == SubFloat ( b , 3 ) ;
}
FORCEINLINE int TestSignSIMD ( const fltx4 & a ) // mask of which floats have the high bit set
{
int nRet = 0 ;
nRet |= ( SubInt ( a , 0 ) & 0x80000000 ) >> 31 ; // sign(x) -> bit 0
nRet |= ( SubInt ( a , 1 ) & 0x80000000 ) >> 30 ; // sign(y) -> bit 1
nRet |= ( SubInt ( a , 2 ) & 0x80000000 ) >> 29 ; // sign(z) -> bit 2
nRet |= ( SubInt ( a , 3 ) & 0x80000000 ) >> 28 ; // sign(w) -> bit 3
return nRet ;
}
FORCEINLINE bool IsAnyNegative ( const fltx4 & a ) // (a.x < 0) || (a.y < 0) || (a.z < 0) || (a.w < 0)
{
return ( 0 != TestSignSIMD ( a ) ) ;
}
FORCEINLINE bool IsAnyTrue ( const fltx4 & a ) // true if any lane's high bit is set (i.e. any lane of a comparison mask is "true")
{
return ( 0 != TestSignSIMD ( a ) ) ;
}
FORCEINLINE fltx4 CmpEqSIMD ( const fltx4 & a , const fltx4 & b ) // (a==b) ? ~0:0
{
fltx4 retVal ;
SubInt ( retVal , 0 ) = ( SubFloat ( a , 0 ) == SubFloat ( b , 0 ) ) ? ~ 0 : 0 ;
SubInt ( retVal , 1 ) = ( SubFloat ( a , 1 ) == SubFloat ( b , 1 ) ) ? ~ 0 : 0 ;
SubInt ( retVal , 2 ) = ( SubFloat ( a , 2 ) == SubFloat ( b , 2 ) ) ? ~ 0 : 0 ;
SubInt ( retVal , 3 ) = ( SubFloat ( a , 3 ) == SubFloat ( b , 3 ) ) ? ~ 0 : 0 ;
return retVal ;
}
FORCEINLINE fltx4 CmpGtSIMD ( const fltx4 & a , const fltx4 & b ) // (a>b) ? ~0:0
{
fltx4 retVal ;
SubInt ( retVal , 0 ) = ( SubFloat ( a , 0 ) > SubFloat ( b , 0 ) ) ? ~ 0 : 0 ;
SubInt ( retVal , 1 ) = ( SubFloat ( a , 1 ) > SubFloat ( b , 1 ) ) ? ~ 0 : 0 ;
SubInt ( retVal , 2 ) = ( SubFloat ( a , 2 ) > SubFloat ( b , 2 ) ) ? ~ 0 : 0 ;
SubInt ( retVal , 3 ) = ( SubFloat ( a , 3 ) > SubFloat ( b , 3 ) ) ? ~ 0 : 0 ;
return retVal ;
}
FORCEINLINE fltx4 CmpGeSIMD ( const fltx4 & a , const fltx4 & b ) // (a>=b) ? ~0:0
{
fltx4 retVal ;
SubInt ( retVal , 0 ) = ( SubFloat ( a , 0 ) >= SubFloat ( b , 0 ) ) ? ~ 0 : 0 ;
SubInt ( retVal , 1 ) = ( SubFloat ( a , 1 ) >= SubFloat ( b , 1 ) ) ? ~ 0 : 0 ;
SubInt ( retVal , 2 ) = ( SubFloat ( a , 2 ) >= SubFloat ( b , 2 ) ) ? ~ 0 : 0 ;
SubInt ( retVal , 3 ) = ( SubFloat ( a , 3 ) >= SubFloat ( b , 3 ) ) ? ~ 0 : 0 ;
return retVal ;
}
FORCEINLINE fltx4 CmpLtSIMD ( const fltx4 & a , const fltx4 & b ) // (a<b) ? ~0:0
{
fltx4 retVal ;
SubInt ( retVal , 0 ) = ( SubFloat ( a , 0 ) < SubFloat ( b , 0 ) ) ? ~ 0 : 0 ;
SubInt ( retVal , 1 ) = ( SubFloat ( a , 1 ) < SubFloat ( b , 1 ) ) ? ~ 0 : 0 ;
SubInt ( retVal , 2 ) = ( SubFloat ( a , 2 ) < SubFloat ( b , 2 ) ) ? ~ 0 : 0 ;
SubInt ( retVal , 3 ) = ( SubFloat ( a , 3 ) < SubFloat ( b , 3 ) ) ? ~ 0 : 0 ;
return retVal ;
}
FORCEINLINE fltx4 CmpLeSIMD ( const fltx4 & a , const fltx4 & b ) // (a<=b) ? ~0:0
{
fltx4 retVal ;
SubInt ( retVal , 0 ) = ( SubFloat ( a , 0 ) <= SubFloat ( b , 0 ) ) ? ~ 0 : 0 ;
SubInt ( retVal , 1 ) = ( SubFloat ( a , 1 ) <= SubFloat ( b , 1 ) ) ? ~ 0 : 0 ;
SubInt ( retVal , 2 ) = ( SubFloat ( a , 2 ) <= SubFloat ( b , 2 ) ) ? ~ 0 : 0 ;
SubInt ( retVal , 3 ) = ( SubFloat ( a , 3 ) <= SubFloat ( b , 3 ) ) ? ~ 0 : 0 ;
return retVal ;
}
FORCEINLINE fltx4 CmpInBoundsSIMD ( const fltx4 & a , const fltx4 & b ) // (a <= b && a >= -b) ? ~0 : 0
{
fltx4 retVal ;
SubInt ( retVal , 0 ) = ( SubFloat ( a , 0 ) <= SubFloat ( b , 0 ) && SubFloat ( a , 0 ) >= - SubFloat ( b , 0 ) ) ? ~ 0 : 0 ;
SubInt ( retVal , 1 ) = ( SubFloat ( a , 1 ) <= SubFloat ( b , 1 ) && SubFloat ( a , 1 ) >= - SubFloat ( b , 1 ) ) ? ~ 0 : 0 ;
SubInt ( retVal , 2 ) = ( SubFloat ( a , 2 ) <= SubFloat ( b , 2 ) && SubFloat ( a , 2 ) >= - SubFloat ( b , 2 ) ) ? ~ 0 : 0 ;
SubInt ( retVal , 3 ) = ( SubFloat ( a , 3 ) <= SubFloat ( b , 3 ) && SubFloat ( a , 3 ) >= - SubFloat ( b , 3 ) ) ? ~ 0 : 0 ;
return retVal ;
}
FORCEINLINE fltx4 MaskedAssign ( const fltx4 & ReplacementMask , const fltx4 & NewValue , const fltx4 & OldValue )
{
return OrSIMD (
AndSIMD ( ReplacementMask , NewValue ) ,
AndNotSIMD ( ReplacementMask , OldValue ) ) ;
}
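// Illustrative usage sketch (not part of the original header): MaskedAssign is
// the branch-free "select" that pairs with the Cmp*SIMD routines above, e.g.
// clamping negative lanes to zero (v is an assumed fltx4 variable):
//
//   fltx4 mask = CmpLtSIMD( v, Four_Zeros );             // ~0 in lanes where v < 0
//   fltx4 clamped = MaskedAssign( mask, Four_Zeros, v ); // 0 where negative, v elsewhere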
FORCEINLINE fltx4 ReplicateX4 ( float flValue ) // a,a,a,a
{
fltx4 retVal ;
SubFloat ( retVal , 0 ) = flValue ;
SubFloat ( retVal , 1 ) = flValue ;
SubFloat ( retVal , 2 ) = flValue ;
SubFloat ( retVal , 3 ) = flValue ;
return retVal ;
}
/// replicate a single 32 bit integer value to all 4 components of an m128
FORCEINLINE fltx4 ReplicateIX4 ( int nValue )
{
fltx4 retVal ;
SubInt ( retVal , 0 ) = nValue ;
SubInt ( retVal , 1 ) = nValue ;
SubInt ( retVal , 2 ) = nValue ;
SubInt ( retVal , 3 ) = nValue ;
return retVal ;
}
// Round towards positive infinity
FORCEINLINE fltx4 CeilSIMD ( const fltx4 & a )
{
fltx4 retVal ;
SubFloat ( retVal , 0 ) = ceil ( SubFloat ( a , 0 ) ) ;
SubFloat ( retVal , 1 ) = ceil ( SubFloat ( a , 1 ) ) ;
SubFloat ( retVal , 2 ) = ceil ( SubFloat ( a , 2 ) ) ;
SubFloat ( retVal , 3 ) = ceil ( SubFloat ( a , 3 ) ) ;
return retVal ;
}
// Round towards negative infinity
FORCEINLINE fltx4 FloorSIMD ( const fltx4 & a )
{
fltx4 retVal ;
SubFloat ( retVal , 0 ) = floor ( SubFloat ( a , 0 ) ) ;
SubFloat ( retVal , 1 ) = floor ( SubFloat ( a , 1 ) ) ;
SubFloat ( retVal , 2 ) = floor ( SubFloat ( a , 2 ) ) ;
SubFloat ( retVal , 3 ) = floor ( SubFloat ( a , 3 ) ) ;
return retVal ;
}
FORCEINLINE fltx4 SqrtEstSIMD ( const fltx4 & a ) // sqrt(a), more or less
{
fltx4 retVal ;
SubFloat ( retVal , 0 ) = sqrt ( SubFloat ( a , 0 ) ) ;
SubFloat ( retVal , 1 ) = sqrt ( SubFloat ( a , 1 ) ) ;
SubFloat ( retVal , 2 ) = sqrt ( SubFloat ( a , 2 ) ) ;
SubFloat ( retVal , 3 ) = sqrt ( SubFloat ( a , 3 ) ) ;
return retVal ;
}
FORCEINLINE fltx4 SqrtSIMD ( const fltx4 & a ) // sqrt(a)
{
fltx4 retVal ;
SubFloat ( retVal , 0 ) = sqrt ( SubFloat ( a , 0 ) ) ;
SubFloat ( retVal , 1 ) = sqrt ( SubFloat ( a , 1 ) ) ;
SubFloat ( retVal , 2 ) = sqrt ( SubFloat ( a , 2 ) ) ;
SubFloat ( retVal , 3 ) = sqrt ( SubFloat ( a , 3 ) ) ;
return retVal ;
}
FORCEINLINE fltx4 ReciprocalSqrtEstSIMD ( const fltx4 & a ) // 1/sqrt(a), more or less
{
fltx4 retVal ;
SubFloat ( retVal , 0 ) = 1.0 / sqrt ( SubFloat ( a , 0 ) ) ;
SubFloat ( retVal , 1 ) = 1.0 / sqrt ( SubFloat ( a , 1 ) ) ;
SubFloat ( retVal , 2 ) = 1.0 / sqrt ( SubFloat ( a , 2 ) ) ;
SubFloat ( retVal , 3 ) = 1.0 / sqrt ( SubFloat ( a , 3 ) ) ;
return retVal ;
}
FORCEINLINE fltx4 ReciprocalSqrtEstSaturateSIMD ( const fltx4 & a )
{
fltx4 retVal ;
SubFloat ( retVal , 0 ) = 1.0 / sqrt ( SubFloat ( a , 0 ) != 0.0f ? SubFloat ( a , 0 ) : FLT_EPSILON ) ;
SubFloat ( retVal , 1 ) = 1.0 / sqrt ( SubFloat ( a , 1 ) != 0.0f ? SubFloat ( a , 1 ) : FLT_EPSILON ) ;
SubFloat ( retVal , 2 ) = 1.0 / sqrt ( SubFloat ( a , 2 ) != 0.0f ? SubFloat ( a , 2 ) : FLT_EPSILON ) ;
SubFloat ( retVal , 3 ) = 1.0 / sqrt ( SubFloat ( a , 3 ) != 0.0f ? SubFloat ( a , 3 ) : FLT_EPSILON ) ;
return retVal ;
}
FORCEINLINE fltx4 ReciprocalSqrtSIMD ( const fltx4 & a ) // 1/sqrt(a)
{
fltx4 retVal ;
SubFloat ( retVal , 0 ) = 1.0 / sqrt ( SubFloat ( a , 0 ) ) ;
SubFloat ( retVal , 1 ) = 1.0 / sqrt ( SubFloat ( a , 1 ) ) ;
SubFloat ( retVal , 2 ) = 1.0 / sqrt ( SubFloat ( a , 2 ) ) ;
SubFloat ( retVal , 3 ) = 1.0 / sqrt ( SubFloat ( a , 3 ) ) ;
return retVal ;
}
FORCEINLINE fltx4 ReciprocalEstSIMD ( const fltx4 & a ) // 1/a, more or less
{
fltx4 retVal ;
SubFloat ( retVal , 0 ) = 1.0 / SubFloat ( a , 0 ) ;
SubFloat ( retVal , 1 ) = 1.0 / SubFloat ( a , 1 ) ;
SubFloat ( retVal , 2 ) = 1.0 / SubFloat ( a , 2 ) ;
SubFloat ( retVal , 3 ) = 1.0 / SubFloat ( a , 3 ) ;
return retVal ;
}
FORCEINLINE fltx4 ReciprocalSIMD ( const fltx4 & a ) // 1/a
{
fltx4 retVal ;
SubFloat ( retVal , 0 ) = 1.0 / SubFloat ( a , 0 ) ;
SubFloat ( retVal , 1 ) = 1.0 / SubFloat ( a , 1 ) ;
SubFloat ( retVal , 2 ) = 1.0 / SubFloat ( a , 2 ) ;
SubFloat ( retVal , 3 ) = 1.0 / SubFloat ( a , 3 ) ;
return retVal ;
}
/// 1/x for all 4 values.
/// 1/0 will result in a big but NOT infinite result
FORCEINLINE fltx4 ReciprocalEstSaturateSIMD ( const fltx4 & a )
{
fltx4 retVal ;
SubFloat ( retVal , 0 ) = 1.0 / ( SubFloat ( a , 0 ) == 0.0f ? FLT_EPSILON : SubFloat ( a , 0 ) ) ;
SubFloat ( retVal , 1 ) = 1.0 / ( SubFloat ( a , 1 ) == 0.0f ? FLT_EPSILON : SubFloat ( a , 1 ) ) ;
SubFloat ( retVal , 2 ) = 1.0 / ( SubFloat ( a , 2 ) == 0.0f ? FLT_EPSILON : SubFloat ( a , 2 ) ) ;
SubFloat ( retVal , 3 ) = 1.0 / ( SubFloat ( a , 3 ) == 0.0f ? FLT_EPSILON : SubFloat ( a , 3 ) ) ;
return retVal ;
}
FORCEINLINE fltx4 ReciprocalSaturateSIMD ( const fltx4 & a )
{
fltx4 retVal ;
SubFloat ( retVal , 0 ) = 1.0 / ( SubFloat ( a , 0 ) == 0.0f ? FLT_EPSILON : SubFloat ( a , 0 ) ) ;
SubFloat ( retVal , 1 ) = 1.0 / ( SubFloat ( a , 1 ) == 0.0f ? FLT_EPSILON : SubFloat ( a , 1 ) ) ;
SubFloat ( retVal , 2 ) = 1.0 / ( SubFloat ( a , 2 ) == 0.0f ? FLT_EPSILON : SubFloat ( a , 2 ) ) ;
SubFloat ( retVal , 3 ) = 1.0 / ( SubFloat ( a , 3 ) == 0.0f ? FLT_EPSILON : SubFloat ( a , 3 ) ) ;
return retVal ;
}
// 2^x for all values (the antilog)
FORCEINLINE fltx4 ExpSIMD ( const fltx4 & toPower )
{
fltx4 retVal ;
SubFloat ( retVal , 0 ) = powf ( 2 , SubFloat ( toPower , 0 ) ) ;
SubFloat ( retVal , 1 ) = powf ( 2 , SubFloat ( toPower , 1 ) ) ;
SubFloat ( retVal , 2 ) = powf ( 2 , SubFloat ( toPower , 2 ) ) ;
SubFloat ( retVal , 3 ) = powf ( 2 , SubFloat ( toPower , 3 ) ) ;
return retVal ;
}
FORCEINLINE fltx4 Dot3SIMD ( const fltx4 & a , const fltx4 & b )
{
float flDot = SubFloat ( a , 0 ) * SubFloat ( b , 0 ) +
SubFloat ( a , 1 ) * SubFloat ( b , 1 ) +
SubFloat ( a , 2 ) * SubFloat ( b , 2 ) ;
return ReplicateX4 ( flDot ) ;
}
FORCEINLINE fltx4 Dot4SIMD ( const fltx4 & a , const fltx4 & b )
{
float flDot = SubFloat ( a , 0 ) * SubFloat ( b , 0 ) +
SubFloat ( a , 1 ) * SubFloat ( b , 1 ) +
SubFloat ( a , 2 ) * SubFloat ( b , 2 ) +
SubFloat ( a , 3 ) * SubFloat ( b , 3 ) ;
return ReplicateX4 ( flDot ) ;
}
// Clamps the components of a vector to a specified minimum and maximum range.
FORCEINLINE fltx4 ClampVectorSIMD ( FLTX4 in , FLTX4 min , FLTX4 max )
{
return MaxSIMD ( min , MinSIMD ( max , in ) ) ;
}
// Squelch the w component of a vector to +0.0.
// Most efficient when you say a = SetWToZeroSIMD(a) (avoids a copy)
FORCEINLINE fltx4 SetWToZeroSIMD ( const fltx4 & a )
{
fltx4 retval ;
retval = a ;
SubFloat ( retval , 3 ) = 0 ;
return retval ;
}
FORCEINLINE fltx4 LoadUnalignedSIMD ( const void * pSIMD )
{
return * ( reinterpret_cast < const fltx4 * > ( pSIMD ) ) ;
}
FORCEINLINE fltx4 LoadUnaligned3SIMD ( const void * pSIMD )
{
return * ( reinterpret_cast < const fltx4 * > ( pSIMD ) ) ;
}
// load a single unaligned float into the x component of a SIMD word
FORCEINLINE fltx4 LoadUnalignedFloatSIMD ( const float * pFlt )
{
fltx4 retval ;
SubFloat ( retval , 0 ) = * pFlt ;
return retval ;
}
FORCEINLINE fltx4 LoadAlignedSIMD ( const void * pSIMD )
{
return * ( reinterpret_cast < const fltx4 * > ( pSIMD ) ) ;
}
// for the transitional class -- load a 3-by VectorAligned and squash its w component
FORCEINLINE fltx4 LoadAlignedSIMD ( const VectorAligned & pSIMD )
{
fltx4 retval = LoadAlignedSIMD ( pSIMD . Base ( ) ) ;
// squelch w
SubInt ( retval , 3 ) = 0 ;
return retval ;
}
// construct a fltx4 from four different scalars, which are assumed to be neither aligned nor contiguous
FORCEINLINE fltx4 LoadGatherSIMD ( const float & x , const float & y , const float & z , const float & w )
{
fltx4 retval = { x , y , z , w } ;
return retval ;
}
FORCEINLINE void StoreAlignedSIMD ( float * pSIMD , const fltx4 & a )
{
* ( reinterpret_cast < fltx4 * > ( pSIMD ) ) = a ;
}
FORCEINLINE void StoreUnalignedSIMD ( float * pSIMD , const fltx4 & a )
{
* ( reinterpret_cast < fltx4 * > ( pSIMD ) ) = a ;
}
FORCEINLINE void StoreUnalignedFloat ( float * pSingleFloat , const fltx4 & a )
{
* pSingleFloat = SubFloat ( a , 0 ) ;
}
FORCEINLINE void StoreUnaligned3SIMD ( float * pSIMD , const fltx4 & a )
{
* pSIMD = SubFloat ( a , 0 ) ;
* ( pSIMD + 1 ) = SubFloat ( a , 1 ) ;
* ( pSIMD + 2 ) = SubFloat ( a , 2 ) ;
}
// strongly typed -- syntactic castor oil used for typechecking as we transition to SIMD
FORCEINLINE void StoreAligned3SIMD ( VectorAligned * RESTRICT pSIMD , const fltx4 & a )
{
StoreAlignedSIMD ( pSIMD - > Base ( ) , a ) ;
}
// Store the x,y,z components of the four FLTX4 parameters
// into the four consecutive Vectors:
// pDestination[0], pDestination[1], pDestination[2], pDestination[3]
// The Vectors are assumed to be unaligned.
FORCEINLINE void StoreFourUnalignedVector3SIMD ( fltx4 a , fltx4 b , fltx4 c , FLTX4 d , // first three passed by copy (deliberate)
Vector3D * const pDestination )
{
StoreUnaligned3SIMD ( pDestination - > Base ( ) , a ) ;
StoreUnaligned3SIMD ( ( pDestination + 1 ) - > Base ( ) , b ) ;
StoreUnaligned3SIMD ( ( pDestination + 2 ) - > Base ( ) , c ) ;
StoreUnaligned3SIMD ( ( pDestination + 3 ) - > Base ( ) , d ) ;
}
// Store the x,y,z components of the four FLTX4 parameters
// into the four consecutive Vectors:
// pDestination , pDestination + 1, pDestination + 2, pDestination + 3
// The Vectors are assumed to start on an ALIGNED address, that is,
// pDestination is 16-byte aligned (though obviously pDestination+1 is not).
FORCEINLINE void StoreFourAlignedVector3SIMD ( fltx4 a , fltx4 b , fltx4 c , FLTX4 d , // first three passed by copy (deliberate)
Vector3D * const pDestination )
{
StoreUnaligned3SIMD ( pDestination - > Base ( ) , a ) ;
StoreUnaligned3SIMD ( ( pDestination + 1 ) - > Base ( ) , b ) ;
StoreUnaligned3SIMD ( ( pDestination + 2 ) - > Base ( ) , c ) ;
StoreUnaligned3SIMD ( ( pDestination + 3 ) - > Base ( ) , d ) ;
}
FORCEINLINE void TransposeSIMD ( fltx4 & x , fltx4 & y , fltx4 & z , fltx4 & w )
{
# define SWAP_FLOATS( _a_, _ia_, _b_, _ib_ ) { float tmp = SubFloat( _a_, _ia_ ); SubFloat( _a_, _ia_ ) = SubFloat( _b_, _ib_ ); SubFloat( _b_, _ib_ ) = tmp; }
SWAP_FLOATS ( x , 1 , y , 0 ) ;
SWAP_FLOATS ( x , 2 , z , 0 ) ;
SWAP_FLOATS ( x , 3 , w , 0 ) ;
SWAP_FLOATS ( y , 2 , z , 1 ) ;
SWAP_FLOATS ( y , 3 , w , 1 ) ;
SWAP_FLOATS ( z , 3 , w , 2 ) ;
}
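// Illustrative usage sketch (not part of the original header): TransposeSIMD
// is what turns four xyzw vectors into the "structure of arrays" layout this
// header is built around (all x's in one register, all y's in another, ...):
//
//   fltx4 v0 = LoadUnalignedSIMD( pFloats + 0 );  // x0 y0 z0 w0
//   fltx4 v1 = LoadUnalignedSIMD( pFloats + 4 );  // x1 y1 z1 w1
//   fltx4 v2 = LoadUnalignedSIMD( pFloats + 8 );  // x2 y2 z2 w2
//   fltx4 v3 = LoadUnalignedSIMD( pFloats + 12 ); // x3 y3 z3 w3
//   TransposeSIMD( v0, v1, v2, v3 );              // v0 = x0 x1 x2 x3, v1 = y0 y1 y2 y3, etc.
//   (pFloats is an assumed float array of at least 16 elements)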
// find the lowest component of a.x, a.y, a.z,
// and replicate it to the whole return value.
FORCEINLINE fltx4 FindLowestSIMD3 ( const fltx4 & a )
{
float lowest = min ( min ( SubFloat ( a , 0 ) , SubFloat ( a , 1 ) ) , SubFloat ( a , 2 ) ) ;
return ReplicateX4 ( lowest ) ;
}
// find the highest component of a.x, a.y, a.z,
// and replicate it to the whole return value.
FORCEINLINE fltx4 FindHighestSIMD3 ( const fltx4 & a )
{
float highest = max ( max ( SubFloat ( a , 0 ) , SubFloat ( a , 1 ) ) , SubFloat ( a , 2 ) ) ;
return ReplicateX4 ( highest ) ;
}
// Fixed-point conversion and save as SIGNED INTS.
// pDest->x = Int (vSrc.x)
// note: some architectures have means of doing
// fixed point conversion when the fix depth is
// specified as an immediate.. but there is no way
// to guarantee an immediate as a parameter to a function
// like this.
FORCEINLINE void ConvertStoreAsIntsSIMD ( intx4 * RESTRICT pDest , const fltx4 & vSrc )
{
( * pDest ) [ 0 ] = SubFloat ( vSrc , 0 ) ;
( * pDest ) [ 1 ] = SubFloat ( vSrc , 1 ) ;
( * pDest ) [ 2 ] = SubFloat ( vSrc , 2 ) ;
( * pDest ) [ 3 ] = SubFloat ( vSrc , 3 ) ;
}
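// Illustrative usage sketch (not part of the original header): converting a
// register of floats to signed ints and reading them back on the integer side
// (the scale by 2^21 is just an example of manufacturing fixed point; v is an
// assumed fltx4 variable):
//
//   intx4 fixed;
//   ConvertStoreAsIntsSIMD( &fixed, MulSIMD( v, Four_2ToThe21s ) );
//   int nFixedX = fixed[ 0 ];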
// ------------------------------------
// INTEGER SIMD OPERATIONS.
// ------------------------------------
// splat all components of a vector to a signed immediate int number.
FORCEINLINE fltx4 IntSetImmediateSIMD ( int nValue )
{
fltx4 retval ;
SubInt ( retval , 0 ) = SubInt ( retval , 1 ) = SubInt ( retval , 2 ) = SubInt ( retval , 3 ) = nValue ;
return retval ;
}
// Load 4 aligned words into a SIMD register
FORCEINLINE i32x4 LoadAlignedIntSIMD ( const void * RESTRICT pSIMD )
{
return * ( reinterpret_cast < const i32x4 * > ( pSIMD ) ) ;
}
// Load 4 unaligned words into a SIMD register
FORCEINLINE i32x4 LoadUnalignedIntSIMD ( const void * RESTRICT pSIMD )
{
return * ( reinterpret_cast < const i32x4 * > ( pSIMD ) ) ;
}
// save into four words, 16-byte aligned
FORCEINLINE void StoreAlignedIntSIMD ( int32 * pSIMD , const fltx4 & a )
{
* ( reinterpret_cast < i32x4 * > ( pSIMD ) ) = a ;
}
FORCEINLINE void StoreAlignedIntSIMD ( intx4 & pSIMD , const fltx4 & a )
{
* ( reinterpret_cast < i32x4 * > ( pSIMD . Base ( ) ) ) = a ;
}
FORCEINLINE void StoreUnalignedIntSIMD ( int32 * pSIMD , const fltx4 & a )
{
* ( reinterpret_cast < i32x4 * > ( pSIMD ) ) = a ;
}
// Load four consecutive uint16's, and turn them into floating point numbers.
// This function isn't especially fast and could be made faster if anyone is
// using it heavily.
FORCEINLINE fltx4 LoadAndConvertUint16SIMD ( const uint16 * pInts )
{
fltx4 retval ;
SubFloat ( retval , 0 ) = pInts [ 0 ] ;
SubFloat ( retval , 1 ) = pInts [ 1 ] ;
SubFloat ( retval , 2 ) = pInts [ 2 ] ;
SubFloat ( retval , 3 ) = pInts [ 3 ] ;
return retval ;
}
// Take a fltx4 containing fixed-point uints and
// return them as single precision floats. No
// fixed point conversion is done.
FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD ( const u32x4 & vSrcA )
{
Assert ( 0 ) ; /* pc has no such operation */
fltx4 retval ;
SubFloat ( retval , 0 ) = ( ( float ) SubInt ( vSrcA , 0 ) ) ;
SubFloat ( retval , 1 ) = ( ( float ) SubInt ( vSrcA , 1 ) ) ;
SubFloat ( retval , 2 ) = ( ( float ) SubInt ( vSrcA , 2 ) ) ;
SubFloat ( retval , 3 ) = ( ( float ) SubInt ( vSrcA , 3 ) ) ;
return retval ;
}
#if 0 /* pc has no such op */
// Take a fltx4 containing fixed-point sints and
// return them as single precision floats. No
// fixed point conversion is done.
FORCEINLINE fltx4 SignedIntConvertToFltSIMD ( const i32x4 & vSrcA )
{
fltx4 retval ;
SubFloat ( retval , 0 ) = ( ( float ) ( reinterpret_cast < int32 * > ( & vSrcA . m128_s32 [ 0 ] ) ) ) ;
SubFloat ( retval , 1 ) = ( ( float ) ( reinterpret_cast < int32 * > ( & vSrcA . m128_s32 [ 1 ] ) ) ) ;
SubFloat ( retval , 2 ) = ( ( float ) ( reinterpret_cast < int32 * > ( & vSrcA . m128_s32 [ 2 ] ) ) ) ;
SubFloat ( retval , 3 ) = ( ( float ) ( reinterpret_cast < int32 * > ( & vSrcA . m128_s32 [ 3 ] ) ) ) ;
return retval ;
}
/*
works on fltx4's as if they are four uints.
the first parameter contains the words to be shifted,
the second contains the amount to shift by AS INTS
for i = 0 to 3
shift = vSrcB_i*32:(i*32)+4
vReturned_i*32:(i*32)+31 = vSrcA_i*32:(i*32)+31 << shift
*/
FORCEINLINE i32x4 IntShiftLeftWordSIMD ( const i32x4 & vSrcA , const i32x4 & vSrcB )
{
i32x4 retval ;
SubInt ( retval , 0 ) = SubInt ( vSrcA , 0 ) << SubInt ( vSrcB , 0 ) ;
SubInt ( retval , 1 ) = SubInt ( vSrcA , 1 ) << SubInt ( vSrcB , 1 ) ;
SubInt ( retval , 2 ) = SubInt ( vSrcA , 2 ) << SubInt ( vSrcB , 2 ) ;
SubInt ( retval , 3 ) = SubInt ( vSrcA , 3 ) << SubInt ( vSrcB , 3 ) ;
return retval ;
}
# endif
# elif ( defined( _PS3 ) )
# define SN_IMPROVED_INTRINSICS ( (( __GNUC__ == 4 ) && ( __GNUC_MINOR__ == 1 ) && ( __GNUC_PATCHLEVEL__ == 1 )) ||\
( defined ( __SN_VER__ ) && ( __SN_VER__ > 25002 ) ) )
//---------------------------------------------------------------------
// PS3 implementation
//---------------------------------------------------------------------
FORCEINLINE float FloatSIMD ( fltx4 & a , int idx )
{
# if SN_IMPROVED_INTRINSICS
return vec_extract ( a , idx ) ;
# else
fltx4_union a_union ;
vec_st ( a , 0 , & a_union . vmxf ) ;
return a_union . m128_f32 [ idx ] ;
# endif
}
FORCEINLINE unsigned int UIntSIMD ( u32x4 & a , int idx )
{
# if SN_IMPROVED_INTRINSICS
return vec_extract ( a , idx ) ;
# else
fltx4_union a_union ;
vec_st ( a , 0 , & a_union . vmxui ) ;
return a_union . m128_u32 [ idx ] ;
# endif
}
FORCEINLINE fltx4 AddSIMD ( const fltx4 & a , const fltx4 & b )
{
return vec_add ( a , b ) ;
}
FORCEINLINE fltx4 SubSIMD ( const fltx4 & a , const fltx4 & b ) // a-b
{
return vec_sub ( a , b ) ;
}
FORCEINLINE fltx4 MulSIMD ( const fltx4 & a , const fltx4 & b ) // a*b
{
return vec_madd ( a , b , _VEC_ZEROF ) ;
}
FORCEINLINE fltx4 MaddSIMD ( const fltx4 & a , const fltx4 & b , const fltx4 & c ) // a*b + c
{
return vec_madd ( a , b , c ) ;
}
FORCEINLINE fltx4 MsubSIMD ( const fltx4 & a , const fltx4 & b , const fltx4 & c ) // c - a*b
{
return vec_nmsub ( a , b , c ) ;
} ;
FORCEINLINE fltx4 Dot3SIMD ( const fltx4 & a , const fltx4 & b )
{
// oliviern: it seems that this code could be optimized
// (or maybe the latency will slow down if there is nothing to put in between)
// Something like that (to verify on PS3 and SPU):
// result2 = vec_madd(a, b, _VEC_ZEROF); // a0 * b0, a1 * b1, a2 * b2, a3 * b3
// result = vec_add(vec_sld(result2, result2, 4), result2); // (a0 * b0) + (a1 * b1), (a1 * b1) + (a2 * b2), (a2 * b2) + (a3 * b3), (a3 * b3) + (a0 * b0)
// result = vec_add(vec_sld(result2, result2, 8), result); // (a0 * b0) + (a1 * b1) + (a2 * b2), (a1 * b1) + (a2 * b2) + (a3 * b3), (a2 * b2) + (a3 * b3) + (a0 * b0), (a3 * b3) + (a0 * b0) + ...
// result = vec_splat(result, 0); // DotProduct3...
// 6 SIMD instructions instead of 8 (but again with potentially one more latency - it depends if other stuff can be interleaved in between).
// It may still be a bit faster in the worst case.
fltx4 result ;
result = vec_madd ( a , b , _VEC_ZEROF ) ;
result = vec_madd ( vec_sld ( a , a , 4 ) , vec_sld ( b , b , 4 ) , result ) ;
result = vec_madd ( vec_sld ( a , a , 8 ) , vec_sld ( b , b , 8 ) , result ) ;
// replicate across all
result = vec_splat ( result , 0 ) ;
return result ;
}
FORCEINLINE fltx4 Dot4SIMD ( const fltx4 & a , const fltx4 & b )
{
// See comment in Dot3SIMD, we could reduce to 6 SIMD instructions instead of 7 (but again with potentially one more latency).
// result = vec_madd(a, b, _VEC_ZEROF); // a0 * b0, a1 * b1, a2 * b2, a3 * b3
// result = vec_add(vec_sld(result, result, 4), result); // (a0 * b0) + (a1 * b1), (a1 * b1) + (a2 * b2), (a2 * b2) + (a3 * b3), (a3 * b3) + (a0 * b0)
// result = vec_add(vec_sld(result, result, 8), result); // (a0 * b0) + (a1 * b1) + (a2 * b2) + (a3 * b3), ...
// result = vec_splat(result, 0); // DotProduct3...
// 6 SIMD instructions instead of 7 (but again with potentially one more latency - it depends if other stuff can be interleaved in between).
// It may be a wash in the worst case.
fltx4 result ;
result = vec_madd ( a , b , _VEC_ZEROF ) ;
result = vec_madd ( vec_sld ( a , a , 4 ) , vec_sld ( b , b , 4 ) , result ) ;
result = vec_add ( vec_sld ( result , result , 8 ) , result ) ;
// replicate across all
result = vec_splat ( result , 0 ) ;
return result ;
}
FORCEINLINE fltx4 SinSIMD ( const fltx4 & radians )
{
return sinf4 ( radians ) ;
}
FORCEINLINE void SinCos3SIMD ( fltx4 & sine , fltx4 & cosine , const fltx4 & radians )
{
sincosf4 ( radians , & sine , & cosine ) ;
}
FORCEINLINE void SinCosSIMD ( fltx4 & sine , fltx4 & cosine , const fltx4 & radians )
{
sincosf4 ( radians , & sine , & cosine ) ;
}
FORCEINLINE fltx4 ArcCosSIMD ( const fltx4 & cs )
{
return acosf4 ( cs ) ;
}
FORCEINLINE fltx4 ArcTan2SIMD ( const fltx4 & a , const fltx4 & b )
{
return atan2f4 ( a , b ) ;
}
FORCEINLINE fltx4 ArcSinSIMD ( const fltx4 & sine )
{
return asinf4 ( sine ) ;
}
// DivSIMD defined further down, since it uses ReciprocalSIMD
FORCEINLINE fltx4 MaxSIMD ( const fltx4 & a , const fltx4 & b ) // max(a,b)
{
return vec_max ( a , b ) ;
}
FORCEINLINE fltx4 MinSIMD ( const fltx4 & a , const fltx4 & b ) // min(a,b)
{
return vec_min ( a , b ) ;
}
FORCEINLINE fltx4 AndSIMD ( const fltx4 & a , const fltx4 & b ) // a & b
{
return vec_and ( a , b ) ;
}
FORCEINLINE fltx4 AndSIMD ( const bi32x4 & a , const fltx4 & b ) // a & b
{
return vec_and ( ( fltx4 ) a , b ) ;
}
FORCEINLINE fltx4 AndSIMD ( const fltx4 & a , const bi32x4 & b ) // a & b
{
return vec_and ( a , ( fltx4 ) b ) ;
}
FORCEINLINE bi32x4 AndSIMD ( const bi32x4 & a , const bi32x4 & b ) // a & b
{
return vec_and ( a , b ) ;
2022-07-06 21:11:32 +02:00
}
#if 0
FORCEINLINE fltx4 AndNotSIMD ( const fltx4 & a , const fltx4 & b ) // ~a & b
{
// NOTE: a and b are swapped in the call: SSE complements the first argument, VMX the second
return vec_andc ( b , a ) ;
}
FORCEINLINE fltx4 AndNotSIMD ( const bi32x4 & a , const fltx4 & b ) // ~a & b
{
// NOTE: a and b are swapped in the call: SSE complements the first argument, VMX the second
return vec_andc ( b , ( fltx4 ) a ) ;
}
FORCEINLINE fltx4 AndNotSIMD ( const fltx4 & a , const bi32x4 & b ) // ~a & b
{
// NOTE: a and b are swapped in the call: SSE complements the first argument, VMX the second
return ( fltx4 ) vec_andc ( b , ( bi32x4 ) a ) ;
}
FORCEINLINE bi32x4 AndNotSIMD ( const bi32x4 & a , const bi32x4 & b ) // ~a & b
{
// NOTE: a and b are swapped in the call: SSE complements the first argument, VMX the second
return vec_andc ( b , a ) ;
}
# else
template < typename T , typename U >
FORCEINLINE T AndNotSIMD ( const T & a , const U & b ) // ~a & b
{
return vec_andc ( b , ( T ) a ) ;
}
// specialize for the case of bi, flt
FORCEINLINE fltx4 AndNotSIMD ( const bi32x4 & a , const fltx4 & b ) // ~a & b
{
return vec_andc ( b , ( fltx4 ) a ) ;
}
# endif
FORCEINLINE fltx4 XorSIMD ( const fltx4 & a , const fltx4 & b ) // a ^ b
{
return vec_xor ( a , b ) ;
}
FORCEINLINE fltx4 XorSIMD ( const bi32x4 & a , const fltx4 & b ) // a ^ b
{
return vec_xor ( ( fltx4 ) a , b ) ;
}
FORCEINLINE fltx4 XorSIMD ( const fltx4 & a , const bi32x4 & b ) // a ^ b
{
return vec_xor ( a , ( fltx4 ) b ) ;
}
FORCEINLINE bi32x4 XorSIMD ( const bi32x4 & a , const bi32x4 & b ) // a ^ b
{
return vec_xor ( a , b ) ;
2022-07-06 21:11:32 +02:00
}
FORCEINLINE fltx4 OrSIMD ( const fltx4 & a , const fltx4 & b ) // a | b
{
return vec_or ( a , b ) ;
}
FORCEINLINE fltx4 OrSIMD ( const bi32x4 & a , const fltx4 & b ) // a | b
{
return vec_or ( ( fltx4 ) a , b ) ;
}
FORCEINLINE fltx4 OrSIMD ( const fltx4 & a , const bi32x4 & b ) // a | b
{
return vec_or ( a , ( fltx4 ) b ) ;
}
FORCEINLINE i32x4 OrSIMD ( const i32x4 & a , const i32x4 & b ) // a | b
{
return vec_or ( a , b ) ;
}
FORCEINLINE u32x4 OrSIMD ( const u32x4 & a , const u32x4 & b ) // a | b
{
return vec_or ( a , b ) ;
}
# if !defined(__SPU__) // bi32x4 typedef to same as u32x4 on SPU
FORCEINLINE bi32x4 OrSIMD ( const bi32x4 & a , const bi32x4 & b ) // a | b
{
return vec_or ( a , b ) ;
}
# endif
2022-07-08 00:55:01 +02:00
FORCEINLINE fltx4 NegSIMD ( const fltx4 & a ) // negate: -a
2022-07-06 21:11:32 +02:00
{
return ( SubSIMD ( _VEC_ZEROF , a ) ) ;
// untested
// vec_float4 signMask;
// vec_float4 result;
// signMask = vec_splat_s32(-1);
// signMask = vec_sll(signMask, signMask);
// result = vec_xor(a, signMask);
// return result;
}
FORCEINLINE bool IsAnyZeros ( const fltx4 & a ) // any floats are zero?
{
return vec_any_eq ( a , _VEC_ZEROF ) ;
}
FORCEINLINE bool IsAnyZeros ( const bi32x4 & a ) // any floats are zero?
{
return vec_any_eq ( ( u32x4 ) a , _VEC_ZERO ) ;
}
FORCEINLINE bool IsAllZeros ( const bi32x4 & a ) // all floats of a zero?
{
return vec_all_eq ( ( u32x4 ) a , _VEC_ZERO ) ;
}
FORCEINLINE bool IsAnyXYZZero ( const fltx4 & a ) // are any of x,y,z zero?
{
# if SN_IMPROVED_INTRINSICS
// push 1.0 into w (NON-ZERO)
fltx4 b = vec_insert ( 1.0f , a , 3 ) ;
return vec_any_eq ( b , _VEC_ZEROF ) ;
# else
fltx4 b = vec_perm ( a , _VEC_ONEF , _VEC_PERMUTE_XYZ0W1 ) ;
return vec_any_eq ( b , _VEC_ZEROF ) ;
# endif
}
// for branching when a.xyzw > b.xyzw
FORCEINLINE bool IsAllGreaterThan ( const fltx4 & a , const fltx4 & b )
{
return vec_all_gt ( a , b ) ;
}
// for branching when a.xyzw >= b.xyzw
FORCEINLINE bool IsAllGreaterThanOrEq ( const fltx4 & a , const fltx4 & b )
{
return vec_all_ge ( a , b ) ;
}
FORCEINLINE bool IsAllEqual ( const fltx4 & a , const fltx4 & b )
{
return vec_all_eq ( a , b ) ;
}
FORCEINLINE int TestSignSIMD ( const fltx4 & a ) // mask of which floats have the high bit set
{
// NOTE: this maps to SSE way better than it does to VMX (most code uses IsAnyNegative(), though)
int nRet = 0 ;
fltx4_union a_union ;
vec_st ( a , 0 , & a_union . vmxf ) ;
nRet |= ( a_union . m128_u32 [ 0 ] & 0x80000000 ) >> 31 ; // sign(x) -> bit 0
nRet |= ( a_union . m128_u32 [ 1 ] & 0x80000000 ) >> 30 ; // sign(y) -> bit 1
nRet |= ( a_union . m128_u32 [ 2 ] & 0x80000000 ) >> 29 ; // sign(z) -> bit 2
nRet |= ( a_union . m128_u32 [ 3 ] & 0x80000000 ) >> 28 ; // sign(w) -> bit 3
return nRet ;
}
FORCEINLINE int TestSignSIMD ( const bi32x4 & a ) // mask of which floats have the high bit set
{
// NOTE: this maps to SSE way better than it does to VMX (most code uses IsAnyNegative(), though)
int nRet = 0 ;
fltx4_union a_union ;
vec_st ( a , 0 , & a_union . vmxbi ) ;
nRet |= ( a_union . m128_u32 [ 0 ] & 0x80000000 ) >> 31 ; // sign(x) -> bit 0
nRet |= ( a_union . m128_u32 [ 1 ] & 0x80000000 ) >> 30 ; // sign(y) -> bit 1
nRet |= ( a_union . m128_u32 [ 2 ] & 0x80000000 ) >> 29 ; // sign(z) -> bit 2
nRet |= ( a_union . m128_u32 [ 3 ] & 0x80000000 ) >> 28 ; // sign(w) -> bit 3
return nRet ;
}
FORCEINLINE bool IsAnyNegative ( const bi32x4 & a ) // (a.x < 0) || (a.y < 0) || (a.z < 0) || (a.w < 0)
{
return ( 0 != TestSignSIMD ( a ) ) ;
}
// Squelch the w component of a vector to +0.0.
// Most efficient when you say a = SetWToZeroSIMD(a) (avoids a copy)
FORCEINLINE fltx4 SetWToZeroSIMD ( const fltx4 & a )
{
return ( fltx4 ) vec_and ( ( u32x4 ) a , _VEC_CLEAR_WMASK ) ;
}
FORCEINLINE bi32x4 SetWToZeroSIMD ( const bi32x4 & a )
{
return ( bi32x4 ) vec_and ( ( u32x4 ) a , _VEC_CLEAR_WMASK ) ;
2022-07-06 21:11:32 +02:00
}
FORCEINLINE bool IsAnyNegative ( const fltx4 & a ) // (a.x < 0) || (a.y < 0) || (a.z < 0) || (a.w < 0)
{
// NOTE: this tests the top bits of each vector element using integer math
// (so it ignores NaNs - it will return true for "-NaN")
return vec_any_lt ( a , _VEC_ZEROF ) ;
}
FORCEINLINE bool IsAnyTrue ( const fltx4 & a )
{
return vec_any_ne ( a , _VEC_ZEROF ) ;
}
# ifdef DIFFERENT_NATIVE_VECTOR_TYPES
FORCEINLINE bool IsAnyTrue ( const bi32x4 & a )
{
return vec_any_ne ( ( vector unsigned int ) a , _VEC_0L ) ;
}
# endif
FORCEINLINE bi32x4 CmpEqSIMD ( const fltx4 & a , const fltx4 & b ) // (a==b) ? ~0:0
{
return ( bi32x4 ) vec_cmpeq ( a , b ) ;
}
FORCEINLINE bi32x4 CmpEqSIMD ( const i32x4 & a , const i32x4 & b ) // (a==b) ? ~0:0
{
return ( bi32x4 ) vec_cmpeq ( a , b ) ;
}
FORCEINLINE bi32x4 CmpEqSIMD ( const u32x4 & a , const u32x4 & b ) // (a==b) ? ~0:0
{
return ( bi32x4 ) vec_cmpeq ( a , b ) ;
}
FORCEINLINE bi32x4 CmpGtSIMD ( const fltx4 & a , const fltx4 & b ) // (a>b) ? ~0:0
{
return ( bi32x4 ) vec_cmpgt ( a , b ) ;
}
FORCEINLINE bi32x4 CmpGtSIMD ( const i32x4 & a , const i32x4 & b ) // (a>b) ? ~0:0
{
return ( bi32x4 ) vec_cmpgt ( a , b ) ;
}
FORCEINLINE bi32x4 CmpGtSIMD ( const u32x4 & a , const u32x4 & b ) // (a>b) ? ~0:0
{
return ( bi32x4 ) vec_cmpgt ( a , b ) ;
}
FORCEINLINE bi32x4 CmpGeSIMD ( const fltx4 & a , const fltx4 & b ) // (a>=b) ? ~0:0
{
return ( bi32x4 ) vec_cmpge ( a , b ) ;
}
FORCEINLINE bi32x4 CmpLtSIMD ( const fltx4 & a , const fltx4 & b ) // (a<b) ? ~0:0
{
return ( bi32x4 ) vec_cmplt ( a , b ) ;
}
FORCEINLINE bi32x4 CmpLeSIMD ( const fltx4 & a , const fltx4 & b ) // (a<=b) ? ~0:0
{
return ( bi32x4 ) vec_cmple ( a , b ) ;
}
FORCEINLINE bi32x4 CmpInBoundsSIMD ( const fltx4 & a , const fltx4 & b ) // (a <= b && a >= -b) ? ~0 : 0
{
i32x4 control ;
control = vec_cmpb ( a , b ) ;
return ( bi32x4 ) vec_cmpeq ( ( u32x4 ) control , _VEC_ZERO ) ;
}
FORCEINLINE int CmpAnyLeSIMD ( const fltx4 & a , const fltx4 & b )
{
return vec_any_le ( a , b ) ;
}
FORCEINLINE int CmpAnyGeSIMD ( const fltx4 & a , const fltx4 & b )
{
return vec_any_ge ( a , b ) ;
}
FORCEINLINE int CmpAnyLtSIMD ( const fltx4 & a , const fltx4 & b )
{
return vec_any_lt ( a , b ) ;
}
FORCEINLINE int CmpAnyLtSIMD ( const bi32x4 & a , const i32x4 & b )
{
return vec_any_lt ( ( i32x4 ) a , b ) ;
}
FORCEINLINE int CmpAnyGtSIMD ( const fltx4 & a , const fltx4 & b )
{
return vec_any_gt ( a , b ) ;
}
FORCEINLINE int CmpAnyNeSIMD ( const fltx4 & a , const fltx4 & b )
{
return vec_any_ne ( a , b ) ;
}
FORCEINLINE int CmpAnyNeSIMD ( const bi32x4 & a , const bi32x4 & b )
{
return vec_any_ne ( a , b ) ;
}
FORCEINLINE int CmpAnyNeSIMD ( const bi32x4 & a , const i32x4 & b )
{
return vec_any_ne ( a , ( bi32x4 ) b ) ;
}
FORCEINLINE int CmpAllLeSIMD ( const fltx4 & a , const fltx4 & b )
{
return vec_all_le ( a , b ) ;
}
FORCEINLINE fltx4 MaskedAssign ( const bi32x4 & ReplacementMask , const fltx4 & NewValue , const fltx4 & OldValue )
{
return vec_sel ( OldValue , NewValue , ReplacementMask ) ;
}
FORCEINLINE fltx4 MaskedAssign ( const fltx4 & ReplacementMask , const fltx4 & NewValue , const fltx4 & OldValue )
{
return vec_sel ( OldValue , NewValue , ( const bi32x4 ) ReplacementMask ) ;
}
FORCEINLINE vector signed short MaskedAssign ( const vector unsigned short & ReplacementMask , const vector signed short & NewValue , const vector signed short & OldValue )
{
return vec_sel ( OldValue , NewValue , ReplacementMask ) ;
}
// AKA "Broadcast", "Splat"
FORCEINLINE fltx4 ReplicateX4 ( float flValue ) // a,a,a,a
{
# if SN_IMPROVED_INTRINSICS
return vec_splats ( flValue ) ;
# else
// NOTE: if flValue comes from a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!)
float * pValue = & flValue ;
Assert ( pValue ) ;
Assert ( ( ( unsigned int ) pValue & 3 ) == 0 ) ;
fltx4 result ;
result = vec_ld ( 0 , pValue ) ;
result = vec_splat ( vec_perm ( result , result , vec_lvsl ( 0 , pValue ) ) , 0 ) ;
return result ;
# endif
}
FORCEINLINE fltx4 ReplicateX4 ( const float * pValue ) // a,a,a,a
{
# if SN_IMPROVED_INTRINSICS
return vec_splats ( * pValue ) ;
# else
Assert ( pValue ) ;
fltx4 result ;
result = vec_ld ( 0 , pValue ) ;
result = vec_splat ( vec_perm ( result , result , vec_lvsl ( 0 , pValue ) ) , 0 ) ;
return result ;
# endif
}
/// replicate a single 32 bit integer value to all 4 components of an m128
FORCEINLINE i32x4 ReplicateIX4 ( int nValue )
{
# if SN_IMPROVED_INTRINSICS
return vec_splats ( nValue ) ;
# else
// NOTE: if nValue comes from a register, this causes a Load-Hit-Store stall (should not mix ints with fltx4s!)
int * pValue = & nValue ;
Assert ( pValue ) ;
Assert ( ( ( unsigned int ) pValue & 3 ) = = 0 ) ;
i32x4 result ;
result = vec_ld ( 0 , pValue ) ;
result = vec_splat ( vec_perm ( result , result , vec_lvsl ( 0 , pValue ) ) , 0 ) ;
return result ;
# endif
}
FORCEINLINE fltx4 SqrtSIMD ( const fltx4 & a ) // sqrt(a)
{
return sqrtf4 ( a ) ;
}
FORCEINLINE fltx4 SqrtEstSIMD ( const fltx4 & a ) // sqrt(a), more or less
{
# if defined( _PS3 ) && !defined( SPU )
// This is exactly what the Xbox 360 does in XMVectorSqrtEst
fltx4 vRecipSquareRoot = vec_rsqrte ( a ) ;
i32x4 vOne = vec_splat_s32 ( 1 ) ;
i32x4 vAllOnes = vec_splat_s32 ( - 1 ) ;
i32x4 vShiftLeft24 = vec_splat_s32 ( - 8 ) ; // -8 is the same bit pattern as 24 with a 5-bit mask
fltx4 vZero = ( fltx4 ) vec_splat_s32 ( 0 ) ;
u32x4 vInputShifted = vec_sl ( ( u32x4 ) a , ( u32x4 ) vOne ) ;
u32x4 vInfinityShifted = vec_sl ( ( u32x4 ) vAllOnes , ( u32x4 ) vShiftLeft24 ) ;
bi32x4 vEqualsZero = vec_vcmpeqfp ( a , vZero ) ;
bi32x4 vEqualsInfinity = vec_vcmpequw ( vInputShifted , vInfinityShifted ) ;
fltx4 vSquareRoot = vec_madd ( a , vRecipSquareRoot , _VEC_ZEROF ) ;
bi32x4 vResultMask = vec_vcmpequw ( ( u32x4 ) vEqualsInfinity , ( u32x4 ) vEqualsZero ) ; // mask has 1s wherever the square root is valid
fltx4 vCorrectedSquareRoot = vec_sel ( a , vSquareRoot , vResultMask ) ;
return vCorrectedSquareRoot ;
# else
return SqrtSIMD ( a ) ;
# endif
}
FORCEINLINE fltx4 ReciprocalSqrtEstSIMD ( const fltx4 & a ) // 1/sqrt(a), more or less
{
return vec_rsqrte ( a ) ;
}
FORCEINLINE fltx4 ReciprocalSqrtSIMD ( const fltx4 & a ) // 1/sqrt(a)
{
// This matches standard library function rsqrtf4
fltx4 result ;
vmathV4RsqrtPerElem ( ( VmathVector4 * ) & result , ( const VmathVector4 * ) & a ) ;
return result ;
}
FORCEINLINE fltx4 ReciprocalEstSIMD ( const fltx4 & a ) // 1/a, more or less
{
return vec_re ( a ) ;
}
/// 1/x for all 4 values, more or less
/// 1/0 will result in a big but NOT infinite result
FORCEINLINE fltx4 ReciprocalEstSaturateSIMD ( const fltx4 & a )
{
bi32x4 zero_mask = CmpEqSIMD ( a , Four_Zeros ) ;
fltx4 ret = OrSIMD ( a , AndSIMD ( Four_Epsilons , zero_mask ) ) ;
ret = ReciprocalEstSIMD ( ret ) ;
return ret ;
}
/// 1/x for all 4 values. uses reciprocal approximation instruction plus newton iteration.
/// No error checking!
FORCEINLINE fltx4 ReciprocalSIMD ( const fltx4 & a ) // 1/a
{
// This matches standard library function recipf4
fltx4 result ;
vmathV4RecipPerElem ( ( VmathVector4 * ) & result , ( const VmathVector4 * ) & a ) ;
return result ;
}
FORCEINLINE fltx4 DivSIMD ( const fltx4 & a , const fltx4 & b ) // a/b
{
return MulSIMD ( ReciprocalSIMD ( b ) , a ) ;
}
FORCEINLINE fltx4 DivEstSIMD ( const fltx4 & a , const fltx4 & b ) // Est(a/b)
{
return MulSIMD ( ReciprocalEstSIMD ( b ) , a ) ;
}
/// 1/x for all 4 values.
/// 1/0 will result in a big but NOT infinite result
FORCEINLINE fltx4 ReciprocalSaturateSIMD ( const fltx4 & a )
{
// Convert zeros to epsilons
bi32x4 zero_mask = CmpEqSIMD ( a , _VEC_ZEROF ) ;
fltx4 a_safe = OrSIMD ( a , AndSIMD ( _VEC_EPSILONF , zero_mask ) ) ;
return ReciprocalSIMD ( a_safe ) ;
// FIXME: This could be faster (BUT: it doesn't preserve the sign of -0.0, whereas the above does)
// fltx4 zeroMask = CmpEqSIMD( gFour_Zeros, a );
// fltx4 a_safe = XMVectorSelect( a, gFour_Epsilons, zeroMask );
// return ReciprocalSIMD( a_safe );
}
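// A minimal usage sketch (illustrative only; the helper name is hypothetical): dividing by a
// per-lane scale that may legitimately contain zeros, without producing infinities.
/****
fltx4 SafeInverseScale( const fltx4 &vValue, const fltx4 &vScale )
{
	// zero lanes of vScale become a huge-but-finite reciprocal rather than INF
	return MulSIMD( vValue, ReciprocalSaturateSIMD( vScale ) );
}
****/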
// CHRISG: is it worth doing integer bitfiddling for this?
// 2^x for all values (the antilog)
FORCEINLINE fltx4 ExpSIMD ( const fltx4 & toPower )
{
return exp2f4 ( toPower ) ;
}
// a unique Altivec concept, the "Vector 2 Raised to the Exponent Estimate Floating Point",
// which is accurate to four bits of mantissa.
FORCEINLINE fltx4 Exp2EstSIMD ( const fltx4 & f )
{
return exp2f4fast ( f ) ;
}
// Clamps the components of a vector to a specified minimum and maximum range.
FORCEINLINE fltx4 ClampVectorSIMD ( FLTX4 in , FLTX4 min , FLTX4 max )
{
fltx4 result = vec_max ( min , in ) ;
return vec_min ( max , result ) ;
}
FORCEINLINE fltx4 LoadUnalignedSIMD ( const void * pSIMD )
{
# if SN_IMPROVED_INTRINSICS
fltx4 v0 , v1 ;
Assert ( pSIMD ) ;
v0 = ( fltx4 ) vec_lvlx ( 0 , ( float * ) pSIMD ) ;
v1 = ( fltx4 ) vec_lvrx ( 16 , ( float * ) pSIMD ) ;
return vec_or ( v0 , v1 ) ;
# else
fltx4 v0 , v1 ;
vector unsigned char permMask ;
Assert ( pSIMD ) ;
v0 = vec_ld ( 0 , pSIMD ) ;
permMask = vec_lvsl ( 0 , pSIMD ) ;
v1 = vec_ld ( 15 , pSIMD ) ;
return vec_perm ( v0 , v1 , permMask ) ;
# endif
}
FORCEINLINE fltx4 LoadUnsignedByte4SIMD ( unsigned char * pBytes ) // unpack contiguous 4 bytes into vec float 4
{
# if SN_IMPROVED_INTRINSICS
__vector unsigned char res_uc ;
__vector unsigned short res_us ;
__vector unsigned char vZero8 = ( __vector unsigned char ) vec_splat_u8 ( 0 ) ;
__vector unsigned short vZero16 = ( __vector unsigned short ) vec_splat_u16 ( 0 ) ;
res_uc = ( __vector unsigned char ) vec_lvlx ( 0 , pBytes ) ;
res_uc = vec_mergeh ( vZero8 , res_uc ) ;
res_us = vec_mergeh ( vZero16 , ( __vector unsigned short ) res_uc ) ;
return vec_ctf ( ( __vector unsigned int ) res_us , 0 ) ;
# else
vector unsigned char v0 , v1 ;
vector bool char res_uc ;
vector unsigned char permMask ;
vector bool short res_us ;
vector bool char vZero8 = ( vector bool char ) vec_splat_u8 ( 0 ) ;
vector bool short vZero16 = ( vector bool short ) vec_splat_u16 ( 0 ) ;
v0 = vec_ld ( 0 , pBytes ) ;
permMask = vec_lvsl ( 0 , pBytes ) ;
v1 = vec_ld ( 3 , pBytes ) ;
res_uc = ( vector bool char ) vec_perm ( v0 , v1 , permMask ) ;
res_uc = vec_mergeh ( vZero8 , res_uc ) ;
res_us = vec_mergeh ( vZero16 , ( vector bool short ) res_uc ) ;
return vec_ctf ( ( vector unsigned int ) res_us , 0 ) ;
# endif
}
FORCEINLINE fltx4 LoadSignedByte4SIMD ( signed char * pBytes ) // unpack contiguous 4 bytes into vec float 4
{
# if SN_IMPROVED_INTRINSICS
vector signed char res_uc ;
vector signed short res_us ;
vector signed int res_ui ;
res_uc = ( vector signed char ) vec_lvlx ( 0 , pBytes ) ;
res_us = vec_unpackh ( res_uc ) ;
res_ui = vec_unpackh ( res_us ) ;
return vec_ctf ( res_ui , 0 ) ;
# else
vector signed char v0 , v1 , res_uc ;
vector unsigned char permMask ;
vector signed short res_us ;
vector signed int res_ui ;
v0 = vec_ld ( 0 , pBytes ) ;
permMask = vec_lvsl ( 0 , pBytes ) ;
v1 = vec_ld ( 3 , pBytes ) ;
res_uc = vec_perm ( v0 , v1 , permMask ) ;
res_us = vec_unpackh ( res_uc ) ;
res_ui = vec_unpackh ( res_us ) ;
return vec_ctf ( res_ui , 0 ) ;
# endif
}
// load a 3-vector (as opposed to LoadUnalignedSIMD, which loads a 4-vec).
FORCEINLINE fltx4 LoadUnaligned3SIMD ( const void * pSIMD )
{
Assert ( pSIMD ) ;
fltx4 v0 = vec_ld ( 0 , ( float * ) ( pSIMD ) ) ;
vector unsigned char permMask = vec_lvsl ( 0 , ( float * ) ( pSIMD ) ) ;
fltx4 v1 = vec_ld ( 11 , ( float * ) ( pSIMD ) ) ;
return vec_perm ( v0 , v1 , permMask ) ;
}
// load a single unaligned float into the x component of a SIMD word
FORCEINLINE fltx4 LoadUnalignedFloatSIMD ( const float * pFlt )
{
fltx4 v0 = vec_lde ( 0 , const_cast < float * > ( pFlt ) ) ;
vector unsigned char permMask = vec_lvsl ( 0 , const_cast < float * > ( pFlt ) ) ;
return vec_perm ( v0 , v0 , permMask ) ;
}
FORCEINLINE fltx4 LoadAlignedSIMD ( const void * pSIMD )
{
return vec_ld ( 0 , ( float * ) pSIMD ) ;
}
# ifndef SPU
// No reason to support VectorAligned on SPU.
// for the transitional class -- load a 3-by VectorAligned and squash its w component
FORCEINLINE fltx4 LoadAlignedSIMD ( const VectorAligned & pSIMD )
{
fltx4 out ;
out = vec_ld ( 0 , pSIMD . Base ( ) ) ;
// squelch w
return ( fltx4 ) vec_and ( ( u32x4 ) out , _VEC_CLEAR_WMASK ) ;
}
// for the transitional class -- load a 3-by VectorAligned and squash its w component
FORCEINLINE fltx4 LoadAlignedSIMD ( const VectorAligned * RESTRICT pSIMD )
{
fltx4 out ;
out = vec_ld ( 0 , pSIMD - > Base ( ) ) ;
// squelch w
return ( fltx4 ) vec_and ( ( u32x4 ) out , _VEC_CLEAR_WMASK ) ;
}
// strongly typed -- for typechecking as we transition to SIMD
FORCEINLINE void StoreAligned3SIMD ( VectorAligned * RESTRICT pSIMD , const fltx4 & a )
{
vec_st ( a , 0 , pSIMD - > Base ( ) ) ;
}
# endif
FORCEINLINE void StoreAlignedSIMD ( float * pSIMD , const fltx4 & a )
{
vec_st ( a , 0 , pSIMD ) ;
}
FORCEINLINE void StoreUnalignedSIMD ( float * pSIMD , const fltx4 & a )
{
# if ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ == 1 ) && ( __GNUC_PATCHLEVEL__ == 1 )
vec_stvlx ( a , 0 , pSIMD ) ;
vec_stvrx ( a , 16 , pSIMD ) ;
# else
fltx4_union a_union ;
vec_st ( a , 0 , & a_union . vmxf ) ;
pSIMD [ 0 ] = a_union . m128_f32 [ 0 ] ;
pSIMD [ 1 ] = a_union . m128_f32 [ 1 ] ;
pSIMD [ 2 ] = a_union . m128_f32 [ 2 ] ;
pSIMD [ 3 ] = a_union . m128_f32 [ 3 ] ;
# endif
}
FORCEINLINE void StoreUnaligned3SIMD ( float * pSIMD , const fltx4 & a )
{
fltx4_union a_union ;
vec_st ( a , 0 , & a_union . vmxf ) ;
pSIMD [ 0 ] = a_union . m128_f32 [ 0 ] ;
pSIMD [ 1 ] = a_union . m128_f32 [ 1 ] ;
pSIMD [ 2 ] = a_union . m128_f32 [ 2 ] ;
} ;
# ifndef SPU
// No reason to support unaligned Vectors on SPU
FORCEINLINE fltx4 Compress4SIMD ( fltx4 const a , fltx4 const & b , fltx4 const & c , fltx4 const & d ) ;
// construct a fltx4 from four different scalars, which are assumed to be neither aligned nor contiguous
FORCEINLINE fltx4 LoadGatherSIMD ( const float & x , const float & y , const float & z , const float & w )
{
# if USING_POINTLESSLY_SLOW_SONY_CODE
return vmathV4MakeFromElems_V ( x , y , z , w ) . vec128 ;
# else
// load the float into the low word of each vector register (this exploits the unaligned load op)
fltx4 vx = vec_lvlx ( 0 , & x ) ;
fltx4 vy = vec_lvlx ( 0 , & y ) ;
fltx4 vz = vec_lvlx ( 0 , & z ) ;
fltx4 vw = vec_lvlx ( 0 , & w ) ;
return Compress4SIMD ( vx , vy , vz , vw ) ;
# endif
}
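// A minimal usage sketch (illustrative only; Sphere_t and flRadius are hypothetical names):
/****
// gather the same member from four unrelated structs into one register:
fltx4 GatherRadii( const Sphere_t *p0, const Sphere_t *p1, const Sphere_t *p2, const Sphere_t *p3 )
{
	return LoadGatherSIMD( p0->flRadius, p1->flRadius, p2->flRadius, p3->flRadius );
}
****/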
// Store the x,y,z components of the four FLTX4 parameters
// into the four consecutive Vectors:
// pDestination[0], pDestination[1], pDestination[2], pDestination[3]
// The Vectors are assumed to be unaligned.
FORCEINLINE void StoreFourUnalignedVector3SIMD ( fltx4 a , fltx4 b , fltx4 c , FLTX4 d , // first three passed by copy (deliberate)
Vector3D * const pDestination )
{
StoreUnaligned3SIMD ( pDestination - > Base ( ) , a ) ;
StoreUnaligned3SIMD ( ( pDestination + 1 ) - > Base ( ) , b ) ;
StoreUnaligned3SIMD ( ( pDestination + 2 ) - > Base ( ) , c ) ;
StoreUnaligned3SIMD ( ( pDestination + 3 ) - > Base ( ) , d ) ;
}
// Store the x,y,z components of the four FLTX4 parameters
// into the four consecutive Vectors:
// pDestination , pDestination + 1, pDestination + 2, pDestination + 3
// The Vectors are assumed to start on an ALIGNED address, that is,
// pDestination is 16-byte aligned (though obviously pDestination+1 is not).
FORCEINLINE void StoreFourAlignedVector3SIMD ( fltx4 a , fltx4 b , fltx4 c , FLTX4 d , // first three passed by copy (deliberate)
Vector3D * const pDestination )
{
StoreUnaligned3SIMD ( pDestination - > Base ( ) , a ) ;
StoreUnaligned3SIMD ( ( pDestination + 1 ) - > Base ( ) , b ) ;
StoreUnaligned3SIMD ( ( pDestination + 2 ) - > Base ( ) , c ) ;
StoreUnaligned3SIMD ( ( pDestination + 3 ) - > Base ( ) , d ) ;
}
# endif
// Fixed-point conversion and save as SIGNED INTS.
// pDest->x = Int (vSrc.x)
// note: some architectures have means of doing
// fixed point conversion when the fix depth is
// specified as an immediate.. but there is no way
// to guarantee an immediate as a parameter to function
// like this.
FORCEINLINE void ConvertStoreAsIntsSIMD ( intx4 * RESTRICT pDest , const fltx4 & vSrc )
{
i32x4 asInt = vec_cts ( vSrc , 0 ) ;
vec_st ( asInt , 0 , pDest - > Base ( ) ) ;
}
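// A minimal usage sketch (illustrative only; variable names are hypothetical):
/****
void QuantizeExample( const fltx4 &vSrc )
{
	intx4 nResults;	// intx4 is declared ALIGN16, so a local instance satisfies the alignment requirement
	ConvertStoreAsIntsSIMD( &nResults, vSrc );	// nResults[i] = (int32) vSrc[i], truncated toward zero
}
****/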
FORCEINLINE void TransposeSIMD ( fltx4 & x , fltx4 & y , fltx4 & z , fltx4 & w )
{
fltx4 p0 , p1 , p2 , p3 ;
p0 = vec_mergeh ( x , z ) ;
p1 = vec_mergeh ( y , w ) ;
p2 = vec_mergel ( x , z ) ;
p3 = vec_mergel ( y , w ) ;
x = vec_mergeh ( p0 , p1 ) ;
y = vec_mergel ( p0 , p1 ) ;
z = vec_mergeh ( p2 , p3 ) ;
w = vec_mergel ( p2 , p3 ) ;
}
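// A minimal usage sketch (illustrative only): converting four xyzw vectors ("array of
// structures") into x, y, z and w lanes ("structure of arrays") in place.
/****
void AOSToSOAExample( fltx4 &v0, fltx4 &v1, fltx4 &v2, fltx4 &v3 )
{
	TransposeSIMD( v0, v1, v2, v3 );
	// now v0 = { v0.x, v1.x, v2.x, v3.x } (original values), v1 holds the y's, v2 the z's, v3 the w's
}
****/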
// Return zero in the fastest way -- faster even than loading.
FORCEINLINE fltx4 LoadZeroSIMD ( void )
{
return _VEC_ZEROF ;
}
FORCEINLINE i32x4 LoadZeroISIMD ( void )
{
return vec_splat_s32 ( 0 ) ;
}
// Return one in the fastest way -- faster even than loading.
FORCEINLINE fltx4 LoadOneSIMD ( void )
{
return _VEC_ONEF ;
}
FORCEINLINE i32x4 LoadOneISIMD ( void )
{
return vec_splat_s32 ( 1 ) ;
}
FORCEINLINE fltx4 SplatXSIMD ( fltx4 a )
{
return vec_splat ( a , 0 ) ;
}
FORCEINLINE fltx4 SplatYSIMD ( fltx4 a )
{
return vec_splat ( a , 1 ) ;
}
FORCEINLINE fltx4 SplatZSIMD ( fltx4 a )
{
return vec_splat ( a , 2 ) ;
}
FORCEINLINE fltx4 SplatWSIMD ( fltx4 a )
{
return vec_splat ( a , 3 ) ;
}
FORCEINLINE bi32x4 SplatXSIMD ( bi32x4 a )
{
return vec_splat ( a , 0 ) ;
}
FORCEINLINE bi32x4 SplatYSIMD ( bi32x4 a )
{
return vec_splat ( a , 1 ) ;
}
FORCEINLINE bi32x4 SplatZSIMD ( bi32x4 a )
{
return vec_splat ( a , 2 ) ;
}
FORCEINLINE bi32x4 SplatWSIMD ( bi32x4 a )
{
return vec_splat ( a , 3 ) ;
}
FORCEINLINE fltx4 SetXSIMD ( const fltx4 & a , const fltx4 & x )
{
return vec_sel ( a , x , _VEC_COMPONENT_MASK_0 ) ;
}
FORCEINLINE fltx4 SetYSIMD ( const fltx4 & a , const fltx4 & y )
{
return vec_sel ( a , y , _VEC_COMPONENT_MASK_1 ) ;
}
FORCEINLINE fltx4 SetZSIMD ( const fltx4 & a , const fltx4 & z )
{
return vec_sel ( a , z , _VEC_COMPONENT_MASK_2 ) ;
}
FORCEINLINE fltx4 SetWSIMD ( const fltx4 & a , const fltx4 & w )
{
return vec_sel ( a , w , _VEC_COMPONENT_MASK_3 ) ;
}
FORCEINLINE fltx4 SetComponentSIMD ( const fltx4 & a , int nComponent , float flValue )
{
# if SN_IMPROVED_INTRINSICS
return vec_insert ( flValue , a , nComponent ) ;
# else
fltx4_union a_union ;
a_union . vmxf = vec_ld ( 0 , & a ) ;
a_union . m128_f32 [ nComponent ] = flValue ;
return a_union . vmxf ;
# endif
}
FORCEINLINE float GetComponentSIMD ( const fltx4 & a , int nComponent )
{
# if SN_IMPROVED_INTRINSICS
return vec_extract ( a , nComponent ) ;
# else
fltx4_union a_union ;
a_union . vmxf = vec_ld ( 0 , & a ) ;
return a_union . m128_f32 [ nComponent ] ;
# endif
}
FORCEINLINE fltx4 RotateLeft ( const fltx4 & a )
{
return vec_sld ( a , a , 4 ) ;
}
FORCEINLINE fltx4 RotateLeft2 ( const fltx4 & a )
{
return vec_sld ( a , a , 8 ) ;
}
FORCEINLINE fltx4 RotateRight ( const fltx4 & a )
{
return vec_sld ( a , a , 12 ) ;
}
FORCEINLINE fltx4 RotateRight2 ( const fltx4 & a )
{
return vec_sld ( a , a , 8 ) ;
}
// rotate a vector left by an arbitrary number of
// bits known at compile time. The bit parameter
// is template because it's actually used as an
// immediate field in an instruction, eg it absolutely
// must be known at compile time. nBits>127 leads
// to doom.
// zeroes are shifted in from the right
template < uint nBits , typename T >
FORCEINLINE T ShiftLeftByBits ( const T & a )
{
// hopefully the compiler, seeing nBits as a const immediate, elides these ifs
if ( nBits > = 128 ) // WTF are you doing?!
{
return ( T ) LoadZeroSIMD ( ) ;
}
else if ( nBits = = 0 )
{
return a ;
}
else if ( ( nBits > 7 ) ) // if we have to rotate by at least one byte, do the by-octet rotation first
{
T t = vec_sld ( a , ( ( T ) LoadZeroSIMD ( ) ) , ( nBits > > 3 ) ) ; // rotated left by octets
return ShiftLeftByBits < ( nBits & 0x7 ) > ( t ) ;
}
else // we need to rotate by <= 7 bits
{
// on AltiVec there's no immediate shift left by bits; we need to splat the bits onto a vector and runtime shift.
// the splat, however, does require an immediate. Go IBM!
vector unsigned int shifter = ( vector unsigned int ) ( vec_splat_s8 ( ( ( signed char ) ( nBits & 0x7 ) ) ) ) ;
return ( T ) vec_sll ( ( vector signed int ) a , shifter ) ;
}
}
// as above, but shift right
template < uint nBits , typename T >
FORCEINLINE T ShiftRightByBits ( const T & a )
{
// hopefully the compiler, seeing nBits as a const immediate, elides these ifs
if ( nBits > = 128 ) // WTF are you doing?!
{
return ( T ) LoadZeroSIMD ( ) ;
}
else if ( nBits = = 0 )
{
return a ;
}
else if ( ( nBits > 7 ) ) // if we have to rotate by at least one byte, do the by-octet rotation first
{
T t = vec_sld ( ( ( T ) LoadZeroSIMD ( ) ) , a , 16 - ( nBits > > 3 ) ) ; // rotated right by octets -- a rotate right of one is like a rotate left of fifteen.
return ShiftRightByBits < ( nBits & 0x7 ) > ( t ) ;
}
else // we need to rotate by <= 7 bits
{
// on AltiVec there's no immediate shift left by bits; we need to splat the bits onto a vector and runtime shift.
// the splat, however, does require an immediate. Go IBM!
vector unsigned int shifter = ( vector unsigned int ) ( vec_splat_s8 ( ( ( signed char ) ( nBits & 0x7 ) ) ) ) ;
return ( T ) vec_srl ( ( vector unsigned int ) a , shifter ) ;
}
}
/**** an example of ShiftLeftByBits:
fltx4 ShiftByTwentyOne ( fltx4 foo )
{
return ShiftLeftByBits < 21 > ( foo ) ;
}
compiles to :
ShiftByTwentyOne ( float __vector ) :
0x000059FC : 0x1060038C vspltisw v3 , 0 PIPE
0x00005A00 : 0x1085030C vspltisb v4 , 5
0x00005A04 : 0x104218AC vsldoi v2 , v2 , v3 , 2 02 ( 000059FC ) REG PIPE
0x00005A08 : 0x104221C4 vsl v2 , v2 , v4 03 ( 00005A04 ) REG
0x00005A0C : 0x4E800020 blr
* * * * */
// find the lowest component of a.x, a.y, a.z,
// and replicate it to the whole return value.
// ignores a.w.
// Forcing this inline should hopefully help with scheduling.
FORCEINLINE fltx4 FindLowestSIMD3 ( const fltx4 & a )
{
fltx4 result ;
fltx4 x = vec_splat ( a , 0 ) ;
fltx4 y = vec_splat ( a , 1 ) ;
fltx4 z = vec_splat ( a , 2 ) ;
if ( vec_any_nan ( a ) )
{
x = vec_all_nan ( x ) ? _VEC_FLTMAX : x ;
y = vec_all_nan ( y ) ? _VEC_FLTMAX : y ;
z = vec_all_nan ( z ) ? _VEC_FLTMAX : z ;
}
result = vec_min ( y , x ) ;
result = vec_min ( z , result ) ;
return result ;
}
// find the highest component of a.x, a.y, a.z,
// and replicate it to the whole return value.
// ignores a.w.
// Though this is only five instructions long,
// they are all dependent, making this stall city.
// Forcing this inline should hopefully help with scheduling.
FORCEINLINE fltx4 FindHighestSIMD3 ( const fltx4 & a )
{
fltx4 result ;
fltx4 x = vec_splat ( a , 0 ) ;
fltx4 y = vec_splat ( a , 1 ) ;
fltx4 z = vec_splat ( a , 2 ) ;
if ( vec_any_nan ( a ) )
{
x = vec_all_nan ( x ) ? _VEC_FLTMIN : x ;
y = vec_all_nan ( y ) ? _VEC_FLTMIN : y ;
z = vec_all_nan ( z ) ? _VEC_FLTMIN : z ;
}
result = vec_max ( y , x ) ;
result = vec_max ( z , result ) ;
return result ;
}
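// A minimal usage sketch (illustrative only; the helper name is hypothetical):
/****
// scalar max of a vector's x, y and z components, e.g. for a loose bounding radius:
float LargestComponent( const fltx4 &v )
{
	return GetComponentSIMD( FindHighestSIMD3( v ), 0 );	// every lane holds max(x,y,z); read lane 0
}
****/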
// ------------------------------------
// INTEGER SIMD OPERATIONS.
// ------------------------------------
// Load 4 aligned words into a SIMD register
FORCEINLINE i32x4 LoadAlignedIntSIMD ( const int32 * RESTRICT pSIMD )
{
return vec_ld ( 0 , const_cast < int32 * > ( pSIMD ) ) ;
}
// Load 4 unaligned words into a SIMD register
FORCEINLINE i32x4 LoadUnalignedIntSIMD ( const int32 * RESTRICT pSIMD )
{
i32x4 v0 , v1 ;
vector unsigned char permMask ;
Assert ( pSIMD ) ;
v0 = vec_ld ( 0 , const_cast < int32 * > ( pSIMD ) ) ;
permMask = vec_lvsl ( 0 , const_cast < int32 * > ( pSIMD ) ) ;
v1 = vec_ld ( 15 , const_cast < int32 * > ( pSIMD ) ) ;
return vec_perm ( v0 , v1 , permMask ) ;
}
// save into four words, 16-byte aligned
FORCEINLINE void StoreAlignedIntSIMD ( int32 * pSIMD , const i32x4 & a )
{
vec_st ( a , 0 , pSIMD ) ;
}
FORCEINLINE void StoreAlignedIntSIMD ( int32 * pSIMD , const fltx4 & a )
{
vec_st ( ( i32x4 ) a , 0 , pSIMD ) ;
}
FORCEINLINE void StoreAlignedIntSIMD ( intx4 & pSIMD , const i32x4 & a )
{
vec_st ( a , 0 , pSIMD . Base ( ) ) ;
}
FORCEINLINE void StoreUnalignedIntSIMD ( int32 * pSIMD , const i32x4 & a )
{
# if SN_IMPROVED_INTRINSICS
// NOTE : NOT TESTED
vec_stvlx ( a , 0 , pSIMD ) ;
vec_stvrx ( a , 16 , pSIMD ) ;
# else
fltx4_union tmp ;
vec_st ( a , 0 , & tmp . vmxi ) ;
pSIMD [ 0 ] = tmp . m128_u32 [ 0 ] ;
pSIMD [ 1 ] = tmp . m128_u32 [ 1 ] ;
pSIMD [ 2 ] = tmp . m128_u32 [ 2 ] ;
pSIMD [ 3 ] = tmp . m128_u32 [ 3 ] ;
# endif
}
// a={ a.x, a.z, b.x, b.z }
// combine two fltx4s by throwing away every other field.
FORCEINLINE fltx4 CompressSIMD ( fltx4 const & a , fltx4 const & b )
{
const int32 ALIGN16 n4shuffleACXZ [ 4 ] ALIGN16_POST = { 0x00010203 , 0x08090A0B , 0x10111213 , 0x18191A1B } ;
return vec_perm ( a , b , ( vec_uchar16 ) LoadAlignedIntSIMD ( n4shuffleACXZ ) ) ;
}
// a={ a.x, b.x, c.x, d.x }
// combine 4 fltx4s by throwing away 3/4s of the fields
// TODO: make more efficient by doing this in a parallel way at the caller
// Compress4SIMD(FourVectors.. )
FORCEINLINE fltx4 Compress4SIMD ( fltx4 const a , fltx4 const & b , fltx4 const & c , fltx4 const & d )
{
fltx4 ab = vec_mergeh ( a , b ) ; // a.x, b.x, a.y, b.y
fltx4 cd = vec_mergeh ( c , d ) ; // c.x, d.x...
static const int32 ALIGN16 shuffleABXY [ 4 ] ALIGN16_POST = { 0x00010203 , 0x04050607 , 0x10111213 , 0x14151617 } ;
return vec_perm ( ab , cd , ( vec_uchar16 ) LoadAlignedIntSIMD ( shuffleABXY ) ) ;
}
// Take a fltx4 containing fixed-point uints and
// return them as single precision floats. No
// fixed point conversion is done.
FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD ( const i32x4 & vSrcA )
{
return vec_ctf ( vSrcA , 0 ) ;
}
// Take a fltx4 containing fixed-point sints and
// return them as single precision floats. No
// fixed point conversion is done.
FORCEINLINE fltx4 SignedIntConvertToFltSIMD ( const i32x4 & vSrcA )
{
return vec_ctf ( vSrcA , 0 ) ;
}
// Take a fltx4 containing fixed-point uints and
// return them as single precision floats. Each uint
// will be divided by 2^immed after conversion
// (eg, this is fixed point math).
/* as if:
FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD ( const i32x4 & vSrcA , unsigned int uImmed )
{
return vec_ctf ( vSrcA , uImmed ) ;
}
*/
# define UnsignedFixedIntConvertToFltSIMD(vSrcA, uImmed) (vec_ctf( (vSrcA), (uImmed) ))
// Take a fltx4 containing fixed-point sints and
// return them as single precision floats. Each int
// will be divided by 2^immed (eg, this is fixed point
// math).
/* as if:
FORCEINLINE fltx4 SignedIntConvertToFltSIMD ( const i32x4 & vSrcA , unsigned int uImmed )
{
return vec_ctf ( vSrcA , uImmed ) ;
}
*/
# define SignedFixedIntConvertToFltSIMD(vSrcA, uImmed) (vec_ctf( (vSrcA), (uImmed) ))
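// A short worked example (illustrative only): with a fixed-point depth of 8 bits,
// a lane holding the integer 384 converts to 384 / 2^8 = 1.5f, and -256 converts to -1.0f.
/****
fltx4 FixedPoint8ToFloat( const i32x4 &vFixed8 )	// lanes hold values pre-scaled by 256
{
	return SignedFixedIntConvertToFltSIMD( vFixed8, 8 );
}
****/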
// set all components of a vector to a signed immediate int number.
/* as if:
FORCEINLINE fltx4 IntSetImmediateSIMD ( int toImmediate )
{
return vec_splat_s32 ( toImmediate ) ;
}
*/
# define IntSetImmediateSIMD(x) (vec_splat_s32(x))
/*
	works on fltx4's as if they are four uints.
	the first parameter contains the words to be shifted,
	the second contains the amount to shift by AS INTS

	for i = 0 to 3
		shift = vSrcB_i*32:(i*32)+4
		vReturned_i*32:(i*32)+31 = vSrcA_i*32:(i*32)+31 << shift
*/
FORCEINLINE u32x4 IntShiftLeftWordSIMD ( u32x4 vSrcA , u32x4 vSrcB )
{
return vec_sl ( vSrcA , vSrcB ) ;
}
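// A short worked example (illustrative only): each lane of the first argument is shifted
// left by the count held in the matching lane of the second argument.
/****
with vSrcA = { 1, 1, 1, 1 } and vSrcB = { 0, 1, 2, 3 },
IntShiftLeftWordSIMD( vSrcA, vSrcB ) yields { 1, 2, 4, 8 }.
****/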
FORCEINLINE float SubFloat ( const fltx4 & a , int idx )
{
# if ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ == 1 ) && ( __GNUC_PATCHLEVEL__ == 1 )
return ( vec_extract ( a , idx ) ) ;
# else // GCC 4.1.1
// NOTE: if the output goes into a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!)
fltx4_union a_union ;
vec_st ( a , 0 , & a_union . vmxf ) ;
return a_union . m128_f32 [ idx ] ;
# endif // GCC 4.1.1
}
FORCEINLINE float & SubFloat ( fltx4 & a , int idx )
{
fltx4_union & a_union = ( fltx4_union & ) a ;
return a_union . m128_f32 [ idx ] ;
}
FORCEINLINE uint32 SubInt ( const u32x4 & a , int idx )
{
# if ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ == 1 ) && ( __GNUC_PATCHLEVEL__ == 1 )
return ( vec_extract ( a , idx ) ) ;
# else // GCC 4.1.1
fltx4_union a_union ;
vec_st ( a , 0 , & a_union . vmxui ) ;
return a_union . m128_u32 [ idx ] ;
# endif // GCC 4.1.1
}
FORCEINLINE uint32 SubInt ( const fltx4 & a , int idx )
{
# if ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ == 1 ) && ( __GNUC_PATCHLEVEL__ == 1 )
return ( vec_extract ( ( u32x4 ) a , idx ) ) ;
# else
fltx4_union a_union ;
vec_st ( a , 0 , & a_union . vmxf ) ;
return a_union . m128_u32 [ idx ] ;
# endif
}
FORCEINLINE uint32 & SubInt ( u32x4 & a , int idx )
{
fltx4_union & a_union = ( fltx4_union & ) a ;
return a_union . m128_u32 [ idx ] ;
}
FORCEINLINE uint32 SubFloatConvertToInt ( const fltx4 & a , int idx )
{
# if ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ == 1 ) && ( __GNUC_PATCHLEVEL__ == 1 )
return ( vec_extract ( vec_ctu ( a , 0 ) , idx ) ) ;
# else
u32x4 t = vec_ctu ( a , 0 ) ;
return SubInt ( t , idx ) ;
# endif
}
// perform an Altivec permute op. There is no corresponding SSE op, so
// this function is missing from that fork. This is deliberate, because
// permute-based algorithms simply need to be abandoned and rebuilt
// a different way for SSE.
// (see http://developer.apple.com/hardwaredrivers/ve/sse.html#Translation_Perm )
template < typename T , typename U >
FORCEINLINE T PermuteVMX ( T a , T b , U swizzleMask )
{
return vec_perm ( a , b , ( vec_uchar16 ) swizzleMask ) ;
}
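// A minimal usage sketch (illustrative only; the helper name is hypothetical). The permute
// mask is a vector of byte indices into the 32-byte concatenation of the two sources.
/****
// swizzle a to { a.y, a.z, a.x, a.w }:
fltx4 SwizzleYZXW( const fltx4 &a )
{
	const vec_uchar16 mask = { 4,5,6,7, 8,9,10,11, 0,1,2,3, 12,13,14,15 };
	return PermuteVMX( a, a, mask );
}
****/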
// __fsel(double fComparand, double fValGE, double fLT) == fComparand >= 0 ? fValGE : fLT
// this is much faster than if ( aFloat > 0 ) { x = .. }
# if !defined(__SPU__)
# define fsel __fsel
# endif
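// A minimal usage sketch of fsel (illustrative only; the helper name is hypothetical):
/****
// branch-free absolute value -- no pipeline flush from a mispredicted compare:
float AbsViaFsel( float x )
{
	return fsel( x, x, -x );	// x >= 0 ? x : -x
}
****/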
inline bool IsVector3LessThan ( const fltx4 & v1 , const fltx4 & v2 )
{
return vec_any_lt ( v1 , v2 ) ;
}
inline bool IsVector3GreaterOrEqual ( const fltx4 & v1 , const fltx4 & v2 )
{
return ! IsVector3LessThan ( v1 , v2 ) ;
}
FORCEINLINE fltx4 ReciprocalSqrtEstSaturateSIMD ( const fltx4 & a )
{
fltx4 retVal ;
SubFloat ( retVal , 0 ) = 1.0 / sqrt ( SubFloat ( a , 0 ) ! = 0.0f ? SubFloat ( a , 0 ) : FLT_EPSILON ) ;
SubFloat ( retVal , 1 ) = 1.0 / sqrt ( SubFloat ( a , 1 ) ! = 0.0f ? SubFloat ( a , 1 ) : FLT_EPSILON ) ;
SubFloat ( retVal , 2 ) = 1.0 / sqrt ( SubFloat ( a , 2 ) ! = 0.0f ? SubFloat ( a , 2 ) : FLT_EPSILON ) ;
SubFloat ( retVal , 3 ) = 1.0 / sqrt ( SubFloat ( a , 3 ) ! = 0.0f ? SubFloat ( a , 3 ) : FLT_EPSILON ) ;
return retVal ;
}
// Round towards negative infinity
FORCEINLINE fltx4 FloorSIMD ( const fltx4 & a )
{
fltx4 retVal ;
SubFloat ( retVal , 0 ) = floor ( SubFloat ( a , 0 ) ) ;
SubFloat ( retVal , 1 ) = floor ( SubFloat ( a , 1 ) ) ;
SubFloat ( retVal , 2 ) = floor ( SubFloat ( a , 2 ) ) ;
SubFloat ( retVal , 3 ) = floor ( SubFloat ( a , 3 ) ) ;
return retVal ;
}
# elif ( defined( _X360 ) )
//---------------------------------------------------------------------
// X360 implementation
//---------------------------------------------------------------------
inline bool IsVector3LessThan ( const fltx4 & v1 , const fltx4 & v2 )
{
return ! XMVector3GreaterOrEqual ( v1 , v2 ) ;
}
inline BOOL IsVector3GreaterOrEqual ( const fltx4 & v1 , const fltx4 & v2 )
{
return XMVector3GreaterOrEqual ( v1 , v2 ) ;
}
FORCEINLINE float & FloatSIMD ( fltx4 & a , int idx )
{
fltx4_union & a_union = ( fltx4_union & ) a ;
return a_union . m128_f32 [ idx ] ;
}
FORCEINLINE unsigned int & UIntSIMD ( fltx4 & a , int idx )
{
fltx4_union & a_union = ( fltx4_union & ) a ;
return a_union . m128_u32 [ idx ] ;
}
FORCEINLINE fltx4 AddSIMD ( const fltx4 & a , const fltx4 & b )
{
return __vaddfp ( a , b ) ;
}
FORCEINLINE fltx4 SubSIMD ( const fltx4 & a , const fltx4 & b ) // a-b
{
return __vsubfp ( a , b ) ;
}
FORCEINLINE fltx4 MulSIMD ( const fltx4 & a , const fltx4 & b ) // a*b
{
return __vmulfp ( a , b ) ;
}
FORCEINLINE fltx4 MaddSIMD ( const fltx4 & a , const fltx4 & b , const fltx4 & c ) // a*b + c
{
return __vmaddfp ( a , b , c ) ;
}
FORCEINLINE fltx4 MsubSIMD ( const fltx4 & a , const fltx4 & b , const fltx4 & c ) // c - a*b
{
return __vnmsubfp ( a , b , c ) ;
} ;
FORCEINLINE fltx4 Dot3SIMD ( const fltx4 & a , const fltx4 & b )
{
return __vmsum3fp ( a , b ) ;
}
FORCEINLINE fltx4 Dot4SIMD ( const fltx4 & a , const fltx4 & b )
{
return __vmsum4fp ( a , b ) ;
}
FORCEINLINE fltx4 SinSIMD ( const fltx4 & radians )
{
return XMVectorSin ( radians ) ;
}
FORCEINLINE void SinCos3SIMD ( fltx4 & sine , fltx4 & cosine , const fltx4 & radians )
{
XMVectorSinCos ( & sine , & cosine , radians ) ;
}
FORCEINLINE void SinCosSIMD ( fltx4 & sine , fltx4 & cosine , const fltx4 & radians )
{
XMVectorSinCos ( & sine , & cosine , radians ) ;
}
FORCEINLINE void CosSIMD ( fltx4 & cosine , const fltx4 & radians )
{
cosine = XMVectorCos ( radians ) ;
}
FORCEINLINE fltx4 ArcSinSIMD ( const fltx4 & sine )
{
return XMVectorASin ( sine ) ;
}
FORCEINLINE fltx4 ArcCosSIMD ( const fltx4 & cs )
{
return XMVectorACos ( cs ) ;
}
// tan^-1(a/b) .. i.e., pass sin in as a and cos in as b
FORCEINLINE fltx4 ArcTan2SIMD ( const fltx4 & a , const fltx4 & b )
{
return XMVectorATan2 ( a , b ) ;
}
// DivSIMD defined further down, since it uses ReciprocalSIMD
FORCEINLINE fltx4 MaxSIMD ( const fltx4 & a , const fltx4 & b ) // max(a,b)
{
return __vmaxfp ( a , b ) ;
}
FORCEINLINE fltx4 MinSIMD ( const fltx4 & a , const fltx4 & b ) // min(a,b)
{
return __vminfp ( a , b ) ;
}
FORCEINLINE fltx4 AndSIMD ( const fltx4 & a , const fltx4 & b ) // a & b
{
return __vand ( a , b ) ;
}
FORCEINLINE fltx4 AndNotSIMD ( const fltx4 & a , const fltx4 & b ) // ~a & b
{
// NOTE: a and b are swapped in the call: SSE complements the first argument, VMX the second
return __vandc ( b , a ) ;
}
FORCEINLINE fltx4 XorSIMD ( const fltx4 & a , const fltx4 & b ) // a ^ b
{
return __vxor ( a , b ) ;
}
FORCEINLINE fltx4 OrSIMD ( const fltx4 & a , const fltx4 & b ) // a | b
{
return __vor ( a , b ) ;
}
FORCEINLINE fltx4 NegSIMD ( const fltx4 & a ) // negate: -a
{
return XMVectorNegate ( a ) ;
}
FORCEINLINE bool IsAllZeros ( const fltx4 & a ) // all floats of a zero?
{
unsigned int equalFlags = 0 ;
__vcmpeqfpR ( a , Four_Zeros , & equalFlags ) ;
return XMComparisonAllTrue ( equalFlags ) ;
}
FORCEINLINE bool IsAnyZeros ( const fltx4 & a ) // any floats are zero?
{
unsigned int conditionregister ;
XMVectorEqualR ( & conditionregister , a , XMVectorZero ( ) ) ;
return XMComparisonAnyTrue ( conditionregister ) ;
}
FORCEINLINE bool IsAnyXYZZero ( const fltx4 & a ) // are any of x,y,z zero?
{
// copy a's x component into w, in case w was zero.
fltx4 temp = __vrlimi ( a , a , 1 , 1 ) ;
unsigned int conditionregister ;
XMVectorEqualR ( & conditionregister , temp , XMVectorZero ( ) ) ;
return XMComparisonAnyTrue ( conditionregister ) ;
}
// for branching when a.xyzw > b.xyzw
FORCEINLINE bool IsAllGreaterThan ( const fltx4 & a , const fltx4 & b )
{
unsigned int cr ;
XMVectorGreaterR ( & cr , a , b ) ;
return XMComparisonAllTrue ( cr ) ;
}
// for branching when a.xyzw >= b.xyzw
FORCEINLINE bool IsAllGreaterThanOrEq ( const fltx4 & a , const fltx4 & b )
{
unsigned int cr ;
XMVectorGreaterOrEqualR ( & cr , a , b ) ;
return XMComparisonAllTrue ( cr ) ;
}
// for branching when a.xyzw > b.xyzw
FORCEINLINE bool IsAnyGreaterThan ( const fltx4 & a , const fltx4 & b )
{
unsigned int cr ;
XMVectorGreaterR ( & cr , a , b ) ;
return XMComparisonAnyTrue ( cr ) ;
}
// for branching when a.xyzw >= b.xyzw
FORCEINLINE bool IsAnyGreaterThanOrEq ( const fltx4 & a , const fltx4 & b )
{
unsigned int cr ;
XMVectorGreaterOrEqualR ( & cr , a , b ) ;
return XMComparisonAnyTrue ( cr ) ;
}
// For branching if all a.xyzw == b.xyzw
FORCEINLINE bool IsAllEqual ( const fltx4 & a , const fltx4 & b )
{
unsigned int cr ;
XMVectorEqualR ( & cr , a , b ) ;
return XMComparisonAllTrue ( cr ) ;
}
FORCEINLINE int TestSignSIMD ( const fltx4 & a ) // mask of which floats have the high bit set
{
// NOTE: this maps to SSE way better than it does to VMX (most code uses IsAnyNegative(), though)
int nRet = 0 ;
const fltx4_union & a_union = ( const fltx4_union & ) a ;
nRet | = ( a_union . m128_u32 [ 0 ] & 0x80000000 ) > > 31 ; // sign(x) -> bit 0
nRet | = ( a_union . m128_u32 [ 1 ] & 0x80000000 ) > > 30 ; // sign(y) -> bit 1
nRet | = ( a_union . m128_u32 [ 2 ] & 0x80000000 ) > > 29 ; // sign(z) -> bit 2
nRet | = ( a_union . m128_u32 [ 3 ] & 0x80000000 ) > > 28 ; // sign(w) -> bit 3
return nRet ;
}
// Squelch the w component of a vector to +0.0.
// Most efficient when you say a = SetWToZeroSIMD(a) (avoids a copy)
FORCEINLINE fltx4 SetWToZeroSIMD ( const fltx4 & a )
{
return __vrlimi ( a , __vzero ( ) , 1 , 0 ) ;
}
FORCEINLINE bool IsAnyNegative ( const fltx4 & a ) // (a.x < 0) || (a.y < 0) || (a.z < 0) || (a.w < 0)
{
// NOTE: this tests the top bits of each vector element using integer math
// (so it ignores NaNs - it will return true for "-NaN")
unsigned int equalFlags = 0 ;
fltx4 signMask = __vspltisw ( - 1 ) ; // 0xFFFFFFFF 0xFFFFFFFF 0xFFFFFFFF 0xFFFFFFFF (low order 5 bits of each element = 31)
signMask = __vslw ( signMask , signMask ) ; // 0x80000000 0x80000000 0x80000000 0x80000000
__vcmpequwR ( Four_Zeros , __vand ( signMask , a ) , & equalFlags ) ;
return ! XMComparisonAllTrue ( equalFlags ) ;
}
FORCEINLINE bool IsAnyTrue ( const fltx4 & a )
{
unsigned int equalFlags = 0 ;
__vcmpequwR ( Four_Zeros , a , & equalFlags ) ; // compare to zero
return XMComparisonAnyFalse ( equalFlags ) ; // at least one element was not zero, eg was true
}
FORCEINLINE fltx4 CmpEqSIMD ( const fltx4 & a , const fltx4 & b ) // (a==b) ? ~0:0
{
return __vcmpeqfp ( a , b ) ;
}
FORCEINLINE fltx4 CmpGtSIMD ( const fltx4 & a , const fltx4 & b ) // (a>b) ? ~0:0
{
return __vcmpgtfp ( a , b ) ;
}
FORCEINLINE fltx4 CmpGeSIMD ( const fltx4 & a , const fltx4 & b ) // (a>=b) ? ~0:0
{
return __vcmpgefp ( a , b ) ;
}
FORCEINLINE fltx4 CmpLtSIMD ( const fltx4 & a , const fltx4 & b ) // (a<b) ? ~0:0
{
return __vcmpgtfp ( b , a ) ;
}
FORCEINLINE fltx4 CmpLeSIMD ( const fltx4 & a , const fltx4 & b ) // (a<=b) ? ~0:0
{
return __vcmpgefp ( b , a ) ;
}
FORCEINLINE fltx4 CmpInBoundsSIMD ( const fltx4 & a , const fltx4 & b ) // (a <= b && a >= -b) ? ~0 : 0
{
return XMVectorInBounds ( a , b ) ;
}
// returned[i] = ReplacementMask[i] == 0 ? OldValue : NewValue
FORCEINLINE fltx4 MaskedAssign ( const fltx4 & ReplacementMask , const fltx4 & NewValue , const fltx4 & OldValue )
{
return __vsel ( OldValue , NewValue , ReplacementMask ) ;
}
// perform an Altivec permute op. There is no corresponding SSE op, so
// this function is missing from that fork. This is deliberate, because
// permute-based algorithms simply need to be abandoned and rebuilt
// a different way for SSE.
// (see http://developer.apple.com/hardwaredrivers/ve/sse.html#Translation_Perm )
template < typename T , typename U >
FORCEINLINE T PermuteVMX ( T a , T b , U swizzleMask )
{
return __vperm ( a , b , swizzleMask ) ;
}
// AKA "Broadcast", "Splat"
FORCEINLINE fltx4 ReplicateX4 ( float flValue ) // a,a,a,a
{
// NOTE: if flValue comes from a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!)
float * pValue = & flValue ;
Assert ( pValue ) ;
Assert ( ( ( unsigned int ) pValue & 3 ) = = 0 ) ;
return __vspltw ( __lvlx ( pValue , 0 ) , 0 ) ;
}
FORCEINLINE fltx4 ReplicateX4 ( const float * pValue ) // a,a,a,a
{
Assert ( pValue ) ;
return __vspltw ( __lvlx ( pValue , 0 ) , 0 ) ;
}
/// replicate a single 32 bit integer value to all 4 components of an m128
FORCEINLINE fltx4 ReplicateIX4 ( int nValue )
{
// NOTE: if nValue comes from a register, this causes a Load-Hit-Store stall (should not mix ints with fltx4s!)
int * pValue = & nValue ;
Assert ( pValue ) ;
Assert ( ( ( unsigned int ) pValue & 3 ) = = 0 ) ;
return __vspltw ( __lvlx ( pValue , 0 ) , 0 ) ;
}
// Round towards positive infinity
FORCEINLINE fltx4 CeilSIMD ( const fltx4 & a )
{
return __vrfip ( a ) ;
}
// Round towards nearest integer
FORCEINLINE fltx4 RoundSIMD ( const fltx4 & a )
{
return __vrfin ( a ) ;
}
// Round towards negative infinity
FORCEINLINE fltx4 FloorSIMD ( const fltx4 & a )
{
return __vrfim ( a ) ;
}
FORCEINLINE fltx4 SqrtEstSIMD ( const fltx4 & a ) // sqrt(a), more or less
{
// This is emulated from rsqrt
return XMVectorSqrtEst ( a ) ;
}
FORCEINLINE fltx4 SqrtSIMD ( const fltx4 & a ) // sqrt(a)
{
// This is emulated from rsqrt
return XMVectorSqrt ( a ) ;
}
FORCEINLINE fltx4 ReciprocalSqrtEstSIMD ( const fltx4 & a ) // 1/sqrt(a), more or less
{
return __vrsqrtefp ( a ) ;
}
FORCEINLINE fltx4 ReciprocalSqrtEstSaturateSIMD ( const fltx4 & a )
{
// Convert zeros to epsilons
fltx4 zero_mask = CmpEqSIMD ( a , Four_Zeros ) ;
fltx4 a_safe = OrSIMD ( a , AndSIMD ( Four_Epsilons , zero_mask ) ) ;
return ReciprocalSqrtEstSIMD ( a_safe ) ;
}
FORCEINLINE fltx4 ReciprocalSqrtSIMD ( const fltx4 & a ) // 1/sqrt(a)
{
// This uses Newton-Raphson to improve the HW result
return XMVectorReciprocalSqrt ( a ) ;
}
FORCEINLINE fltx4 ReciprocalEstSIMD ( const fltx4 & a ) // 1/a, more or less
{
return __vrefp ( a ) ;
}
/// 1/x for all 4 values. uses reciprocal approximation instruction plus newton iteration.
/// No error checking!
FORCEINLINE fltx4 ReciprocalSIMD ( const fltx4 & a ) // 1/a
{
// This uses Newton-Raphson to improve the HW result
return XMVectorReciprocal ( a ) ;
}
FORCEINLINE fltx4 DivSIMD ( const fltx4 & a , const fltx4 & b ) // a/b
{
return MulSIMD ( ReciprocalSIMD ( b ) , a ) ;
}
FORCEINLINE fltx4 DivEstSIMD ( const fltx4 & a , const fltx4 & b ) // Est(a/b)
{
return MulSIMD ( ReciprocalEstSIMD ( b ) , a ) ;
}
/// 1/x for all 4 values.
/// 1/0 will result in a big but NOT infinite result
FORCEINLINE fltx4 ReciprocalEstSaturateSIMD ( const fltx4 & a )
{
// Convert zeros to epsilons
fltx4 zero_mask = CmpEqSIMD ( a , Four_Zeros ) ;
fltx4 a_safe = OrSIMD ( a , AndSIMD ( Four_Epsilons , zero_mask ) ) ;
return ReciprocalEstSIMD ( a_safe ) ;
}
FORCEINLINE fltx4 ReciprocalSaturateSIMD ( const fltx4 & a )
{
// Convert zeros to epsilons
fltx4 zero_mask = CmpEqSIMD ( a , Four_Zeros ) ;
fltx4 a_safe = OrSIMD ( a , AndSIMD ( Four_Epsilons , zero_mask ) ) ;
return ReciprocalSIMD ( a_safe ) ;
// FIXME: This could be faster (BUT: it doesn't preserve the sign of -0.0, whereas the above does)
// fltx4 zeroMask = CmpEqSIMD( Four_Zeros, a );
// fltx4 a_safe = XMVectorSelect( a, Four_Epsilons, zeroMask );
// return ReciprocalSIMD( a_safe );
}
// CHRISG: is it worth doing integer bitfiddling for this?
// 2^x for all values (the antilog)
FORCEINLINE fltx4 ExpSIMD ( const fltx4 & toPower )
{
return XMVectorExp ( toPower ) ;
}
// a unique Altivec concept, the "Vector 2 Raised to the Exponent Estimate Floating Point",
// which is accurate to four bits of mantissa.
FORCEINLINE fltx4 Exp2EstSIMD ( const fltx4 & f )
{
return XMVectorExpEst ( f ) ;
}
// Clamps the components of a vector to a specified minimum and maximum range.
FORCEINLINE fltx4 ClampVectorSIMD ( FLTX4 in , FLTX4 min , FLTX4 max )
{
return XMVectorClamp ( in , min , max ) ;
}
FORCEINLINE fltx4 LoadUnalignedSIMD ( const void * pSIMD )
{
return XMLoadVector4 ( pSIMD ) ;
}
// load a 3-vector (as opposed to LoadUnalignedSIMD, which loads a 4-vec).
FORCEINLINE fltx4 LoadUnaligned3SIMD ( const void * pSIMD )
{
return XMLoadVector3 ( pSIMD ) ;
}
// load a single unaligned float into the x component of a SIMD word
FORCEINLINE fltx4 LoadUnalignedFloatSIMD ( const float * pFlt )
{
return __lvlx ( pFlt , 0 ) ;
}
FORCEINLINE fltx4 LoadAlignedSIMD ( const void * pSIMD )
{
return * ( reinterpret_cast < const fltx4 * > ( pSIMD ) ) ;
}
// for the transitional class -- load a 3-by VectorAligned and squash its w component
FORCEINLINE fltx4 LoadAlignedSIMD ( const VectorAligned & pSIMD )
{
fltx4 out = XMLoadVector3A ( pSIMD . Base ( ) ) ;
// squelch w
return __vrlimi ( out , __vzero ( ) , 1 , 0 ) ;
}
// for the transitional class -- load a 3-by VectorAligned and squash its w component
FORCEINLINE fltx4 LoadAlignedSIMD ( const VectorAligned * RESTRICT pSIMD )
{
fltx4 out = XMLoadVector3A ( pSIMD ) ;
// squelch w
return __vrlimi ( out , __vzero ( ) , 1 , 0 ) ;
}
FORCEINLINE void StoreAlignedSIMD ( float * pSIMD , const fltx4 & a )
{
* ( reinterpret_cast < fltx4 * > ( pSIMD ) ) = a ;
}
FORCEINLINE void StoreUnalignedSIMD ( float * pSIMD , const fltx4 & a )
{
XMStoreVector4 ( pSIMD , a ) ;
}
FORCEINLINE void StoreUnaligned3SIMD ( float * pSIMD , const fltx4 & a )
{
XMStoreVector3 ( pSIMD , a ) ;
}
// strongly typed -- for typechecking as we transition to SIMD
FORCEINLINE void StoreAligned3SIMD ( VectorAligned * RESTRICT pSIMD , const fltx4 & a )
{
XMStoreVector3A ( pSIMD - > Base ( ) , a ) ;
}
// Store the x,y,z components of the four FLTX4 parameters
// into the four consecutive Vectors:
// pDestination[0], pDestination[1], pDestination[2], pDestination[3]
// The Vectors are assumed to be unaligned.
FORCEINLINE void StoreFourUnalignedVector3SIMD ( fltx4 a , fltx4 b , fltx4 c , FLTX4 d , // first three passed by copy (deliberate)
Vector3D * const pDestination )
{
// since four Vec3s == 48 bytes, we can use full-vector stores here, so long as
// we arrange the data properly first.
// The vrlimi ops trash the destination param which is why we require
// pass-by-copy. I'm counting on the compiler to schedule these properly.
b = __vrlimi ( b , b , 15 , 1 ) ; // b = y1z1__x1
c = __vrlimi ( c , c , 15 , 2 ) ; // c = z2__x2y2
a = __vrlimi ( a , b , 1 , 0 ) ; // a = x0y0z0x1
b = __vrlimi ( b , c , 2 | 1 , 0 ) ; // b = y1z1x2y2
c = __vrlimi ( c , d , 4 | 2 | 1 , 3 ) ; // c = z2x3y3z3
float * RESTRICT pOut = pDestination - > Base ( ) ;
StoreUnalignedSIMD ( pOut + 0 , a ) ;
StoreUnalignedSIMD ( pOut + 4 , b ) ;
StoreUnalignedSIMD ( pOut + 8 , c ) ;
}
// Store the x,y,z components of the four FLTX4 parameters
// into the four consecutive Vectors:
// pDestination , pDestination + 1, pDestination + 2, pDestination + 3
// The Vectors are assumed to start on an ALIGNED address, that is,
// pDestination is 16-byte aligned (though obviously pDestination+1 is not).
FORCEINLINE void StoreFourAlignedVector3SIMD ( fltx4 a , fltx4 b , fltx4 c , FLTX4 d , // first three passed by copy (deliberate)
Vector3D * const pDestination )
{
// since four Vec3s == 48 bytes, we can use full-vector stores here, so long as
// we arrange the data properly first.
// The vrlimi ops trash the destination param which is why we require
// pass-by-copy. I'm counting on the compiler to schedule these properly.
b = __vrlimi ( b , b , 15 , 1 ) ; // b = y1z1__x1
c = __vrlimi ( c , c , 15 , 2 ) ; // c = z2__x2y2
a = __vrlimi ( a , b , 1 , 0 ) ; // a = x0y0z0x1
b = __vrlimi ( b , c , 2 | 1 , 0 ) ; // b = y1z1x2y2
c = __vrlimi ( c , d , 4 | 2 | 1 , 3 ) ; // c = z2x3y3z3
float * RESTRICT pOut = pDestination - > Base ( ) ;
StoreAlignedSIMD ( pOut + 0 , a ) ;
StoreAlignedSIMD ( pOut + 4 , b ) ;
StoreAlignedSIMD ( pOut + 8 , c ) ;
}
// Fixed-point conversion and save as SIGNED INTS.
// pDest->x = Int (vSrc.x)
// note: some architectures have means of doing
// fixed point conversion when the fix depth is
// specified as an immediate.. but there is no way
// to guarantee an immediate as a parameter to function
// like this.
FORCEINLINE void ConvertStoreAsIntsSIMD ( intx4 * RESTRICT pDest , const fltx4 & vSrc )
{
fltx4 asInt = __vctsxs ( vSrc , 0 ) ;
XMStoreVector4A ( pDest - > Base ( ) , asInt ) ;
}
FORCEINLINE void TransposeSIMD ( fltx4 & x , fltx4 & y , fltx4 & z , fltx4 & w )
{
XMMATRIX xyzwMatrix = _XMMATRIX ( x , y , z , w ) ;
xyzwMatrix = XMMatrixTranspose ( xyzwMatrix ) ;
x = xyzwMatrix . r [ 0 ] ;
y = xyzwMatrix . r [ 1 ] ;
z = xyzwMatrix . r [ 2 ] ;
w = xyzwMatrix . r [ 3 ] ;
}
// Return zero in the fastest way -- faster even than loading.
FORCEINLINE fltx4 LoadZeroSIMD ( void )
{
return XMVectorZero ( ) ;
}
// Return one in the fastest way -- faster even than loading.
FORCEINLINE fltx4 LoadOneSIMD ( void )
{
return XMVectorSplatOne ( ) ;
}
FORCEINLINE fltx4 SplatXSIMD ( fltx4 a )
{
return XMVectorSplatX ( a ) ;
}
FORCEINLINE fltx4 SplatYSIMD ( fltx4 a )
{
return XMVectorSplatY ( a ) ;
}
FORCEINLINE fltx4 SplatZSIMD ( fltx4 a )
{
return XMVectorSplatZ ( a ) ;
}
FORCEINLINE fltx4 SplatWSIMD ( fltx4 a )
{
return XMVectorSplatW ( a ) ;
}
FORCEINLINE fltx4 SetXSIMD ( const fltx4 & a , const fltx4 & x )
{
fltx4 result = __vrlimi ( a , x , 8 , 0 ) ;
return result ;
}
FORCEINLINE fltx4 SetYSIMD ( const fltx4 & a , const fltx4 & y )
{
fltx4 result = __vrlimi ( a , y , 4 , 0 ) ;
return result ;
}
FORCEINLINE fltx4 SetZSIMD ( const fltx4 & a , const fltx4 & z )
{
fltx4 result = __vrlimi ( a , z , 2 , 0 ) ;
return result ;
}
FORCEINLINE fltx4 SetWSIMD ( const fltx4 & a , const fltx4 & w )
{
fltx4 result = __vrlimi ( a , w , 1 , 0 ) ;
return result ;
}
FORCEINLINE fltx4 SetComponentSIMD ( const fltx4 & a , int nComponent , float flValue )
{
static int s_nVrlimiMask [ 4 ] = { 8 , 4 , 2 , 1 } ;
fltx4 val = ReplicateX4 ( flValue ) ;
fltx4 result = __vrlimi ( a , val , s_nVrlimiMask [ nComponent ] , 0 ) ;
return result ;
}
FORCEINLINE fltx4 RotateLeft ( const fltx4 & a )
{
fltx4 compareOne = a ;
return __vrlimi ( compareOne , a , 8 | 4 | 2 | 1 , 1 ) ;
}
FORCEINLINE fltx4 RotateLeft2 ( const fltx4 & a )
{
fltx4 compareOne = a ;
return __vrlimi ( compareOne , a , 8 | 4 | 2 | 1 , 2 ) ;
}
FORCEINLINE fltx4 RotateRight ( const fltx4 & a )
{
fltx4 compareOne = a ;
return __vrlimi ( compareOne , a , 8 | 4 | 2 | 1 , 3 ) ;
}
FORCEINLINE fltx4 RotateRight2 ( const fltx4 & a )
{
fltx4 compareOne = a ;
return __vrlimi ( compareOne , a , 8 | 4 | 2 | 1 , 2 ) ;
}
// rotate a vector left by an arbitrary number of
// bits known at compile time. The bit parameter
// is template because it's actually used as an
// immediate field in an instruction, eg it absolutely
// must be known at compile time. nBits>127 leads
// to doom.
// zeroes are shifted in from the right
template < uint nBits >
FORCEINLINE fltx4 ShiftLeftByBits ( const fltx4 & a )
{
// hopefully the compiler, seeing nBits as a const immediate, elides these ifs
if ( nBits > = 128 ) // WTF are you doing?!
{
return LoadZeroSIMD ( ) ;
}
else if ( nBits = = 0 )
{
return a ;
}
else if ( ( nBits > 7 ) ) // if we have to rotate by at least one byte, do the by-octet rotation first
{
fltx4 t = __vsldoi ( a , ( LoadZeroSIMD ( ) ) , ( nBits > > 3 ) ) ; // rotated left by octets
return ShiftLeftByBits < ( nBits & 0x7 ) > ( t ) ;
}
else // we need to rotate by <= 7 bits
{
// on AltiVec there's no immediate shift left by bits; we need to splat the bits onto a vector and runtime shift.
// the splat, however, does require an immediate. Go IBM!
u32x4 shifter = u32x4 ( __vspltisb ( ( ( signed char ) ( nBits & 0x7 ) ) ) ) ;
return __vsl ( a , shifter ) ;
}
}
// as above, but shift right
template < uint nBits >
FORCEINLINE fltx4 ShiftRightByBits ( const fltx4 & a )
{
// hopefully the compiler, seeing nBits as a const immediate, elides these ifs
if ( nBits > = 128 ) // WTF are you doing?!
{
return LoadZeroSIMD ( ) ;
}
else if ( nBits = = 0 )
{
return a ;
}
else if ( ( nBits > 7 ) ) // if we have to rotate by at least one byte, do the by-octet rotation first
{
fltx4 t = __vsldoi ( ( LoadZeroSIMD ( ) ) , a , 16 - ( nBits > > 3 ) ) ; // rotated right by octets -- a rotate right of one is like a rotate left of fifteen.
return ShiftRightByBits < ( nBits & 0x7 ) > ( t ) ;
}
else // we need to rotate by <= 7 bits
{
// on AltiVec there's no immediate shift left by bits; we need to splat the bits onto a vector and runtime shift.
// the splat, however, does require an immediate. Go IBM!
u32x4 shifter = u32x4 ( __vspltisb ( ( ( signed char ) ( nBits & 0x7 ) ) ) ) ;
return __vsr ( a , shifter ) ;
}
}
// find the lowest component of a.x, a.y, a.z,
// and replicate it to the whole return value.
// ignores a.w.
// Though this is only five instructions long,
// they are all dependent, making this stall city.
// Forcing this inline should hopefully help with scheduling.
FORCEINLINE fltx4 FindLowestSIMD3 ( const fltx4 & a )
{
// a is [x,y,z,G] (where G is garbage)
// rotate left by one
fltx4 compareOne = a ;
compareOne = __vrlimi ( compareOne , a , 8 | 4 , 1 ) ;
// compareOne is [y,z,G,G]
fltx4 retval = MinSIMD ( a , compareOne ) ;
// retVal is [min(x,y), min(y,z), G, G]
compareOne = __vrlimi ( compareOne , a , 8 , 2 ) ;
// compareOne is [z, G, G, G]
retval = MinSIMD ( retval , compareOne ) ;
// retVal = [ min(min(x,y),z), G, G, G ]
// splat the x component out to the whole vector and return
return SplatXSIMD ( retval ) ;
}
// find the highest component of a.x, a.y, a.z,
// and replicate it to the whole return value.
// ignores a.w.
// Though this is only five instructions long,
// they are all dependent, making this stall city.
// Forcing this inline should hopefully help with scheduling.
FORCEINLINE fltx4 FindHighestSIMD3 ( const fltx4 & a )
{
// a is [x,y,z,G] (where G is garbage)
// rotate left by one
fltx4 compareOne = a ;
compareOne = __vrlimi ( compareOne , a , 8 | 4 , 1 ) ;
// compareOne is [y,z,G,G]
fltx4 retval = MaxSIMD ( a , compareOne ) ;
// retVal is [max(x,y), max(y,z), G, G]
compareOne = __vrlimi ( compareOne , a , 8 , 2 ) ;
// compareOne is [z, G, G, G]
retval = MaxSIMD ( retval , compareOne ) ;
// retVal = [ max(max(x,y),z), G, G, G ]
// splat the x component out to the whole vector and return
return SplatXSIMD ( retval ) ;
}
// Transform many (horizontal) points in-place by a 3x4 matrix,
// here already loaded onto three fltx4 registers.
// The points must be stored as 16-byte aligned. They are points
// and not vectors because we assume the w-component to be 1.
// To spare yourself the annoyance of loading the matrix yourself,
// use one of the overloads below.
void TransformManyPointsBy ( VectorAligned * RESTRICT pVectors , unsigned int numVectors , FLTX4 mRow1 , FLTX4 mRow2 , FLTX4 mRow3 ) ;
// Transform many (horizontal) points in-place by a 3x4 matrix.
// The points must be stored as 16-byte aligned. They are points
// and not vectors because we assume the w-component to be 1.
// In this function, the matrix need not be aligned.
FORCEINLINE void TransformManyPointsBy ( VectorAligned * RESTRICT pVectors , unsigned int numVectors , const matrix3x4_t & pMatrix )
{
return TransformManyPointsBy ( pVectors , numVectors ,
LoadUnalignedSIMD ( pMatrix [ 0 ] ) , LoadUnalignedSIMD ( pMatrix [ 1 ] ) , LoadUnalignedSIMD ( pMatrix [ 2 ] ) ) ;
}
// Transform many (horizontal) points in-place by a 3x4 matrix.
// The points must be stored as 16-byte aligned. They are points
// and not vectors because we assume the w-component to be 1.
// In this function, the matrix must itself be aligned on a 16-byte
// boundary.
FORCEINLINE void TransformManyPointsByA ( VectorAligned * RESTRICT pVectors , unsigned int numVectors , const matrix3x4_t & pMatrix )
{
return TransformManyPointsBy ( pVectors , numVectors ,
LoadAlignedSIMD ( pMatrix [ 0 ] ) , LoadAlignedSIMD ( pMatrix [ 1 ] ) , LoadAlignedSIMD ( pMatrix [ 2 ] ) ) ;
}
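// A minimal usage sketch (illustrative only; variable names are hypothetical):
/****
void TransformExample( VectorAligned *pPoints, unsigned int nCount, const matrix3x4_t &boneToWorld )
{
	// boneToWorld need not be aligned here; use TransformManyPointsByA if it is
	TransformManyPointsBy( pPoints, nCount, boneToWorld );
}
****/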
// ------------------------------------
// INTEGER SIMD OPERATIONS.
// ------------------------------------
// Load 4 aligned words into a SIMD register
FORCEINLINE i32x4 LoadAlignedIntSIMD ( const void * RESTRICT pSIMD )
{
return XMLoadVector4A ( pSIMD ) ;
}
// Load 4 unaligned words into a SIMD register
FORCEINLINE i32x4 LoadUnalignedIntSIMD ( const void * RESTRICT pSIMD )
{
return XMLoadVector4 ( pSIMD ) ;
}
// save into four words, 16-byte aligned
FORCEINLINE void StoreAlignedIntSIMD ( int32 * pSIMD , const fltx4 & a )
{
* ( reinterpret_cast < i32x4 * > ( pSIMD ) ) = a ;
}
FORCEINLINE void StoreAlignedIntSIMD ( intx4 & pSIMD , const fltx4 & a )
{
* ( reinterpret_cast < i32x4 * > ( pSIMD . Base ( ) ) ) = a ;
}
FORCEINLINE void StoreUnalignedIntSIMD ( int32 * pSIMD , const fltx4 & a )
{
XMStoreVector4 ( pSIMD , a ) ;
}
// Load four consecutive uint16's, and turn them into floating point numbers.
// This function isn't especially fast and could be made faster if anyone is
// using it heavily.
FORCEINLINE fltx4 LoadAndConvertUint16SIMD ( const uint16 * pInts )
{
return XMLoadUShort4 ( reinterpret_cast < const XMUSHORT4 * > ( pInts ) ) ;
}
// a={ a.x, a.z, b.x, b.z }
// combine two fltx4s by throwing away every other field.
FORCEINLINE fltx4 CompressSIMD ( fltx4 const & a , fltx4 const & b )
{
return XMVectorPermute ( a , b , XMVectorPermuteControl ( 0 , 2 , 4 , 6 ) ) ;
}
// a={ a.x, b.x, c.x, d.x }
// combine 4 fltx4s by throwing away 3/4s of the fields
// TODO: make more efficient by doing this in a parallel way at the caller
// Compress4SIMD(FourVectors.. )
FORCEINLINE fltx4 Compress4SIMD ( fltx4 const a , fltx4 const & b , fltx4 const & c , fltx4 const & d )
{
fltx4 abcd = __vrlimi ( a , b , 4 , 3 ) ; // a.x, b.x, a.z, a.w
abcd = __vrlimi ( abcd , c , 2 , 2 ) ; // ax, bx, cx, aw
abcd = __vrlimi ( abcd , d , 1 , 1 ) ; // ax, bx, cx, dx
return abcd ;
}
// construct a fltx4 from four different scalars, which are assumed to be neither aligned nor contiguous
FORCEINLINE fltx4 LoadGatherSIMD ( const float & x , const float & y , const float & z , const float & w )
{
// load the float into the low word of each vector register (this exploits the unaligned load op)
fltx4 vx = __lvlx ( & x , 0 ) ;
fltx4 vy = __lvlx ( & y , 0 ) ;
fltx4 vz = __lvlx ( & z , 0 ) ;
fltx4 vw = __lvlx ( & w , 0 ) ;
return Compress4SIMD ( vx , vy , vz , vw ) ;
}
// Take a fltx4 containing fixed-point uints and
// return them as single precision floats. No
// fixed point conversion is done.
FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD ( const i32x4 & vSrcA )
{
return __vcfux ( vSrcA , 0 ) ;
}
// Take a fltx4 containing fixed-point sints and
// return them as single precision floats. No
// fixed point conversion is done.
FORCEINLINE fltx4 SignedIntConvertToFltSIMD ( const i32x4 & vSrcA )
{
return __vcfsx ( vSrcA , 0 ) ;
}
// Take a fltx4 containing fixed-point uints and
// return them as single precision floats. Each uint
// will be divided by 2^immed after conversion
// (eg, this is fixed point math).
/* as if:
FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD ( const i32x4 & vSrcA , unsigned int uImmed )
{
return __vcfux ( vSrcA , uImmed ) ;
}
*/
# define UnsignedFixedIntConvertToFltSIMD(vSrcA, uImmed) (__vcfux( (vSrcA), (uImmed) ))
// Take a fltx4 containing fixed-point sints and
// return them as single precision floats. Each int
// will be divided by 2^immed (eg, this is fixed point
// math).
/* as if:
FORCEINLINE fltx4 SignedIntConvertToFltSIMD ( const i32x4 & vSrcA , unsigned int uImmed )
{
return __vcfsx ( vSrcA , uImmed ) ;
}
*/
# define SignedFixedIntConvertToFltSIMD(vSrcA, uImmed) (__vcfsx( (vSrcA), (uImmed) ))
// set all components of a vector to a signed immediate int number.
/* as if:
FORCEINLINE fltx4 IntSetImmediateSIMD ( int toImmediate )
{
return __vspltisw ( toImmediate ) ;
}
*/
# define IntSetImmediateSIMD(x) (__vspltisw(x))
/*
	works on fltx4's as if they are four uints.
	the first parameter contains the words to be shifted,
	the second contains the amount to shift by AS INTS

	for i = 0 to 3
		shift = vSrcB_i*32:(i*32)+4
		vReturned_i*32:(i*32)+31 = vSrcA_i*32:(i*32)+31 << shift
*/
FORCEINLINE fltx4 IntShiftLeftWordSIMD ( fltx4 vSrcA , fltx4 vSrcB )
{
return __vslw ( vSrcA , vSrcB ) ;
}
FORCEINLINE float SubFloat ( const fltx4 & a , int idx )
{
// NOTE: if the output goes into a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!)
const fltx4_union & a_union = ( const fltx4_union & ) a ;
return a_union . m128_f32 [ idx ] ;
}
FORCEINLINE float & SubFloat ( fltx4 & a , int idx )
{
fltx4_union & a_union = ( fltx4_union & ) a ;
return a_union . m128_f32 [ idx ] ;
}
FORCEINLINE uint32 SubFloatConvertToInt ( const fltx4 & a , int idx )
{
fltx4 t = __vctuxs ( a , 0 ) ;
const fltx4_union & a_union = ( const fltx4_union & ) t ;
return a_union . m128_u32 [ idx ] ;
}
FORCEINLINE uint32 SubInt ( const fltx4 & a , int idx )
{
const fltx4_union & a_union = ( const fltx4_union & ) a ;
return a_union . m128_u32 [ idx ] ;
}
FORCEINLINE uint32 & SubInt ( fltx4 & a , int idx )
{
fltx4_union & a_union = ( fltx4_union & ) a ;
return a_union . m128_u32 [ idx ] ;
}
# else
//---------------------------------------------------------------------
// Intel/SSE implementation
//---------------------------------------------------------------------
FORCEINLINE void StoreAlignedSIMD ( float * RESTRICT pSIMD , const fltx4 & a )
{
_mm_store_ps ( pSIMD , a ) ;
}
FORCEINLINE void StoreAlignedSIMD ( short * RESTRICT pSIMD , const shortx8 & a )
{
_mm_store_si128 ( ( shortx8 * ) pSIMD , a ) ;
}
FORCEINLINE void StoreUnalignedSIMD ( float * RESTRICT pSIMD , const fltx4 & a )
{
_mm_storeu_ps ( pSIMD , a ) ;
}
FORCEINLINE void StoreUnalignedSIMD ( short * RESTRICT pSIMD , const shortx8 & a )
{
_mm_storeu_si128 ( ( shortx8 * ) pSIMD , a ) ;
}
FORCEINLINE void StoreUnalignedFloat ( float * pSingleFloat , const fltx4 & a )
{
_mm_store_ss ( pSingleFloat , a ) ;
}
FORCEINLINE fltx4 RotateLeft ( const fltx4 & a ) ;
FORCEINLINE fltx4 RotateLeft2 ( const fltx4 & a ) ;
FORCEINLINE void StoreUnaligned3SIMD ( float * pSIMD , const fltx4 & a )
{
_mm_store_ss ( pSIMD , a ) ;
_mm_store_ss ( pSIMD + 1 , RotateLeft ( a ) ) ;
_mm_store_ss ( pSIMD + 2 , RotateLeft2 ( a ) ) ;
}
// strongly typed -- syntactic castor oil used for typechecking as we transition to SIMD
FORCEINLINE void StoreAligned3SIMD ( VectorAligned * RESTRICT pSIMD , const fltx4 & a )
{
StoreAlignedSIMD ( pSIMD - > Base ( ) , a ) ;
}
// Store the x,y,z components of the four FLTX4 parameters
// into the four consecutive Vectors:
// pDestination[0], pDestination[1], pDestination[2], pDestination[3]
// The Vectors are assumed to be unaligned.
FORCEINLINE void StoreFourUnalignedVector3SIMD ( fltx4 a , fltx4 b , fltx4 c , FLTX4 d , // first three passed by copy (deliberate)
Vector3D * const pDestination )
{
StoreUnaligned3SIMD ( pDestination - > Base ( ) , a ) ;
StoreUnaligned3SIMD ( ( pDestination + 1 ) - > Base ( ) , b ) ;
StoreUnaligned3SIMD ( ( pDestination + 2 ) - > Base ( ) , c ) ;
StoreUnaligned3SIMD ( ( pDestination + 3 ) - > Base ( ) , d ) ;
}
// Store the x,y,z components of the four FLTX4 parameters
// into the four consecutive Vectors:
// pDestination , pDestination + 1, pDestination + 2, pDestination + 3
// The Vectors are assumed to start on an ALIGNED address, that is,
// pDestination is 16-byte aligned (though obviously pDestination+1 is not).
FORCEINLINE void StoreFourAlignedVector3SIMD ( fltx4 a , fltx4 b , fltx4 c , FLTX4 d , // first three passed by copy (deliberate)
Vector3D * const pDestination )
{
StoreUnaligned3SIMD ( pDestination - > Base ( ) , a ) ;
StoreUnaligned3SIMD ( ( pDestination + 1 ) - > Base ( ) , b ) ;
StoreUnaligned3SIMD ( ( pDestination + 2 ) - > Base ( ) , c ) ;
StoreUnaligned3SIMD ( ( pDestination + 3 ) - > Base ( ) , d ) ;
}
FORCEINLINE fltx4 LoadAlignedSIMD ( const void * pSIMD )
{
return _mm_load_ps ( reinterpret_cast < const float * > ( pSIMD ) ) ;
}
FORCEINLINE shortx8 LoadAlignedShortSIMD ( const void * pSIMD )
{
return _mm_load_si128 ( reinterpret_cast < const shortx8 * > ( pSIMD ) ) ;
}
FORCEINLINE shortx8 LoadUnalignedShortSIMD ( const void * pSIMD )
{
return _mm_loadu_si128 ( reinterpret_cast < const shortx8 * > ( pSIMD ) ) ;
}
FORCEINLINE fltx4 AndSIMD ( const fltx4 & a , const fltx4 & b ) // a & b
{
return _mm_and_ps ( a , b ) ;
}
FORCEINLINE fltx4 AndNotSIMD ( const fltx4 & a , const fltx4 & b ) // ~a & b (note: the FIRST argument is the one complemented)
{
return _mm_andnot_ps ( a , b ) ;
}
FORCEINLINE fltx4 XorSIMD ( const fltx4 & a , const fltx4 & b ) // a ^ b
{
return _mm_xor_ps ( a , b ) ;
}
FORCEINLINE fltx4 OrSIMD ( const fltx4 & a , const fltx4 & b ) // a | b
{
return _mm_or_ps ( a , b ) ;
}
// Squelch the w component of a vector to +0.0.
// Most efficient when you say a = SetWToZeroSIMD(a) (avoids a copy)
FORCEINLINE fltx4 SetWToZeroSIMD ( const fltx4 & a )
{
return AndSIMD ( a , LoadAlignedSIMD ( g_SIMD_clear_wmask ) ) ;
}
// for the transitional class -- load a 3-by VectorAligned and squash its w component
FORCEINLINE fltx4 LoadAlignedSIMD ( const VectorAligned & pSIMD )
{
return SetWToZeroSIMD ( LoadAlignedSIMD ( pSIMD . Base ( ) ) ) ;
}
FORCEINLINE fltx4 LoadUnalignedSIMD ( const void * pSIMD )
{
return _mm_loadu_ps ( reinterpret_cast < const float * > ( pSIMD ) ) ;
}
FORCEINLINE fltx4 LoadUnaligned3SIMD ( const void * pSIMD )
{
return _mm_loadu_ps ( reinterpret_cast < const float * > ( pSIMD ) ) ;
}
// load a single unaligned float into the x component of a SIMD word
FORCEINLINE fltx4 LoadUnalignedFloatSIMD ( const float * pFlt )
{
return _mm_load_ss ( pFlt ) ;
}
/// replicate a single 32 bit integer value to all 4 components of an m128
FORCEINLINE fltx4 ReplicateIX4 ( int i )
{
fltx4 value = _mm_set_ss ( * ( ( float * ) & i ) ) ;
return _mm_shuffle_ps ( value , value , 0 ) ;
}
FORCEINLINE fltx4 ReplicateX4 ( float flValue )
{
__m128 value = _mm_set_ss ( flValue ) ;
return _mm_shuffle_ps ( value , value , 0 ) ;
}
FORCEINLINE fltx4 ReplicateX4 ( const float * flValue )
{
__m128 value = _mm_set_ss ( * flValue ) ;
return _mm_shuffle_ps ( value , value , 0 ) ;
}
FORCEINLINE float SubFloat ( const fltx4 & a , int idx )
{
// NOTE: if the output goes into a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!)
# ifndef POSIX_MATH
return a . m128_f32 [ idx ] ;
# else
return ( reinterpret_cast < float const * > ( & a ) ) [ idx ] ;
# endif
}
FORCEINLINE float & SubFloat ( fltx4 & a , int idx )
{
# ifndef POSIX_MATH
return a . m128_f32 [ idx ] ;
# else
return ( reinterpret_cast < float * > ( & a ) ) [ idx ] ;
# endif
}
FORCEINLINE uint32 SubFloatConvertToInt ( const fltx4 & a , int idx )
{
return ( uint32 ) SubFloat ( a , idx ) ;
}
FORCEINLINE uint32 SubInt ( const fltx4 & a , int idx )
{
# ifndef POSIX_MATH
return a . m128_u32 [ idx ] ;
# else
return ( reinterpret_cast < uint32 const * > ( & a ) ) [ idx ] ;
# endif
}
FORCEINLINE uint32 & SubInt ( fltx4 & a , int idx )
{
# ifndef POSIX_MATH
return a . m128_u32 [ idx ] ;
# else
return ( reinterpret_cast < uint32 * > ( & a ) ) [ idx ] ;
# endif
}
// Return zero in the fastest way -- on the x360, faster even than loading.
FORCEINLINE fltx4 LoadZeroSIMD ( void )
{
return Four_Zeros ;
}
// Return one in the fastest way -- on the x360, faster even than loading.
FORCEINLINE fltx4 LoadOneSIMD ( void )
{
return Four_Ones ;
}
FORCEINLINE fltx4 MaskedAssign ( const fltx4 & ReplacementMask , const fltx4 & NewValue , const fltx4 & OldValue )
{
return OrSIMD (
AndSIMD ( ReplacementMask , NewValue ) ,
AndNotSIMD ( ReplacementMask , OldValue ) ) ;
}
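// Illustrative usage sketch (not part of the API): MaskedAssign is the SSE1 way to do a
// per-component select. For example, clamping the negative components of some v to zero
// could look like this:
/* as if:
	fltx4 mask   = CmpLtSIMD ( v , Four_Zeros ) ;          // all-ones where v < 0, else all-zeros
	fltx4 result = MaskedAssign ( mask , Four_Zeros , v ) ; // zero where the mask is set, v elsewhere
*/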
// remember, the SSE numbers its words 3 2 1 0
// The way we want to specify shuffles is backwards from the default
// MM_SHUFFLE_REV is in array index order (default is reversed)
# define MM_SHUFFLE_REV(a,b,c,d) _MM_SHUFFLE(d,c,b,a)
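// Illustrative example (assumed values, not part of the API): with fl4 = { 10, 20, 30, 40 },
//   _mm_shuffle_ps( fl4, fl4, MM_SHUFFLE_REV( 3, 2, 1, 0 ) )   yields { 40, 30, 20, 10 }
//   _mm_shuffle_ps( fl4, fl4, MM_SHUFFLE_REV( 0, 0, 1, 1 ) )   yields { 10, 10, 20, 20 }
// ie, the arguments read left-to-right in array index order, unlike raw _MM_SHUFFLE.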
FORCEINLINE fltx4 SplatXSIMD ( fltx4 const & a )
{
return _mm_shuffle_ps ( a , a , MM_SHUFFLE_REV ( 0 , 0 , 0 , 0 ) ) ;
}
FORCEINLINE fltx4 SplatYSIMD ( fltx4 const & a )
{
return _mm_shuffle_ps ( a , a , MM_SHUFFLE_REV ( 1 , 1 , 1 , 1 ) ) ;
}
FORCEINLINE fltx4 SplatZSIMD ( fltx4 const & a )
{
return _mm_shuffle_ps ( a , a , MM_SHUFFLE_REV ( 2 , 2 , 2 , 2 ) ) ;
}
FORCEINLINE fltx4 SplatWSIMD ( fltx4 const & a )
{
return _mm_shuffle_ps ( a , a , MM_SHUFFLE_REV ( 3 , 3 , 3 , 3 ) ) ;
}
FORCEINLINE fltx4 ShuffleXXYY ( const fltx4 & a )
{
return _mm_shuffle_ps ( a , a , MM_SHUFFLE_REV ( 0 , 0 , 1 , 1 ) ) ;
}
FORCEINLINE fltx4 ShuffleXYXY ( const fltx4 & a )
{
return _mm_shuffle_ps ( a , a , MM_SHUFFLE_REV ( 0 , 1 , 0 , 1 ) ) ;
}
FORCEINLINE fltx4 ShuffleZZWW ( const fltx4 & a )
{
return _mm_shuffle_ps ( a , a , MM_SHUFFLE_REV ( 2 , 2 , 3 , 3 ) ) ;
}
FORCEINLINE fltx4 SetXSIMD ( const fltx4 & a , const fltx4 & x )
{
fltx4 result = MaskedAssign ( LoadAlignedSIMD ( g_SIMD_ComponentMask [ 0 ] ) , x , a ) ;
return result ;
}
FORCEINLINE fltx4 SetYSIMD ( const fltx4 & a , const fltx4 & y )
{
fltx4 result = MaskedAssign ( LoadAlignedSIMD ( g_SIMD_ComponentMask [ 1 ] ) , y , a ) ;
return result ;
}
FORCEINLINE fltx4 SetZSIMD ( const fltx4 & a , const fltx4 & z )
{
fltx4 result = MaskedAssign ( LoadAlignedSIMD ( g_SIMD_ComponentMask [ 2 ] ) , z , a ) ;
return result ;
}
FORCEINLINE fltx4 SetWSIMD ( const fltx4 & a , const fltx4 & w )
{
fltx4 result = MaskedAssign ( LoadAlignedSIMD ( g_SIMD_ComponentMask [ 3 ] ) , w , a ) ;
return result ;
}
FORCEINLINE fltx4 SetComponentSIMD ( const fltx4 & a , int nComponent , float flValue )
{
fltx4 val = ReplicateX4 ( flValue ) ;
fltx4 result = MaskedAssign ( LoadAlignedSIMD ( g_SIMD_ComponentMask [ nComponent ] ) , val , a ) ;
return result ;
}
// a b c d -> b c d a
FORCEINLINE fltx4 RotateLeft ( const fltx4 & a )
{
return _mm_shuffle_ps ( a , a , MM_SHUFFLE_REV ( 1 , 2 , 3 , 0 ) ) ;
}
// a b c d -> c d a b
FORCEINLINE fltx4 RotateLeft2 ( const fltx4 & a )
{
return _mm_shuffle_ps ( a , a , MM_SHUFFLE_REV ( 2 , 3 , 0 , 1 ) ) ;
}
// a b c d -> d a b c
FORCEINLINE fltx4 RotateRight ( const fltx4 & a )
{
return _mm_shuffle_ps ( a , a , MM_SHUFFLE_REV ( 3 , 0 , 1 , 2 ) ) ;
}
// a b c d -> c d a b
FORCEINLINE fltx4 RotateRight2 ( const fltx4 & a )
{
return _mm_shuffle_ps ( a , a , MM_SHUFFLE_REV ( 2 , 3 , 0 , 1 ) ) ;
}
FORCEINLINE fltx4 AddSIMD ( const fltx4 & a , const fltx4 & b ) // a+b
{
return _mm_add_ps ( a , b ) ;
}
FORCEINLINE fltx4 SubSIMD ( const fltx4 & a , const fltx4 & b ) // a-b
{
return _mm_sub_ps ( a , b ) ;
} ;
FORCEINLINE fltx4 MulSIMD ( const fltx4 & a , const fltx4 & b ) // a*b
{
return _mm_mul_ps ( a , b ) ;
} ;
FORCEINLINE fltx4 DivSIMD ( const fltx4 & a , const fltx4 & b ) // a/b
{
return _mm_div_ps ( a , b ) ;
} ;
fltx4 ReciprocalEstSIMD ( const fltx4 & a ) ;
FORCEINLINE fltx4 DivEstSIMD ( const fltx4 & a , const fltx4 & b ) // Est(a/b)
{
return MulSIMD ( ReciprocalEstSIMD ( b ) , a ) ;
} ;
FORCEINLINE fltx4 MaddSIMD ( const fltx4 & a , const fltx4 & b , const fltx4 & c ) // a*b + c
{
return AddSIMD ( MulSIMD ( a , b ) , c ) ;
}
FORCEINLINE fltx4 MsubSIMD ( const fltx4 & a , const fltx4 & b , const fltx4 & c ) // c - a*b
{
return SubSIMD ( c , MulSIMD ( a , b ) ) ;
} ;
FORCEINLINE fltx4 Dot3SIMD ( const fltx4 & a , const fltx4 & b )
{
fltx4 m = MulSIMD ( a , b ) ;
return AddSIMD ( AddSIMD ( SplatXSIMD ( m ) , SplatYSIMD ( m ) ) , SplatZSIMD ( m ) ) ;
}
FORCEINLINE fltx4 Dot4SIMD ( const fltx4 & a , const fltx4 & b )
{
// 4 instructions, serial; the order of addition varies, so individual elements may differ in the LSB on some CPUs
fltx4 fl4Product = MulSIMD ( a , b ) ;
fltx4 fl4YXWZ = _mm_shuffle_ps ( fl4Product , fl4Product , MM_SHUFFLE_REV ( 1 , 0 , 3 , 2 ) ) ;
fltx4 fl4UUVV = AddSIMD ( fl4Product , fl4YXWZ ) ; // U = X+Y; V = Z+W
fltx4 fl4VVUU = RotateLeft2 ( fl4UUVV ) ;
return AddSIMD ( fl4UUVV , fl4VVUU ) ;
}
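// Scalar sketch of what Dot4SIMD computes (illustrative only):
/* as if:
	float flDot = SubFloat ( a , 0 ) * SubFloat ( b , 0 ) + SubFloat ( a , 1 ) * SubFloat ( b , 1 ) +
	              SubFloat ( a , 2 ) * SubFloat ( b , 2 ) + SubFloat ( a , 3 ) * SubFloat ( b , 3 ) ;
	return ReplicateX4 ( flDot ) ;
*/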
//TODO: implement as four-way Taylor series (see xbox implementation)
FORCEINLINE fltx4 SinSIMD ( const fltx4 & radians )
{
fltx4 result ;
SubFloat ( result , 0 ) = sin ( SubFloat ( radians , 0 ) ) ;
SubFloat ( result , 1 ) = sin ( SubFloat ( radians , 1 ) ) ;
SubFloat ( result , 2 ) = sin ( SubFloat ( radians , 2 ) ) ;
SubFloat ( result , 3 ) = sin ( SubFloat ( radians , 3 ) ) ;
return result ;
}
FORCEINLINE void SinCos3SIMD ( fltx4 & sine , fltx4 & cosine , const fltx4 & radians )
{
// FIXME: Make a fast SSE version
SinCos ( SubFloat ( radians , 0 ) , & SubFloat ( sine , 0 ) , & SubFloat ( cosine , 0 ) ) ;
SinCos ( SubFloat ( radians , 1 ) , & SubFloat ( sine , 1 ) , & SubFloat ( cosine , 1 ) ) ;
SinCos ( SubFloat ( radians , 2 ) , & SubFloat ( sine , 2 ) , & SubFloat ( cosine , 2 ) ) ;
}
FORCEINLINE void SinCosSIMD ( fltx4 & sine , fltx4 & cosine , const fltx4 & radians ) // a*b + c
{
// FIXME: Make a fast SSE version
SinCos ( SubFloat ( radians , 0 ) , & SubFloat ( sine , 0 ) , & SubFloat ( cosine , 0 ) ) ;
SinCos ( SubFloat ( radians , 1 ) , & SubFloat ( sine , 1 ) , & SubFloat ( cosine , 1 ) ) ;
SinCos ( SubFloat ( radians , 2 ) , & SubFloat ( sine , 2 ) , & SubFloat ( cosine , 2 ) ) ;
SinCos ( SubFloat ( radians , 3 ) , & SubFloat ( sine , 3 ) , & SubFloat ( cosine , 3 ) ) ;
}
//TODO: implement as four-way Taylor series (see xbox implementation)
FORCEINLINE fltx4 ArcSinSIMD ( const fltx4 & sine )
{
// FIXME: Make a fast SSE version
fltx4 result ;
SubFloat ( result , 0 ) = asin ( SubFloat ( sine , 0 ) ) ;
SubFloat ( result , 1 ) = asin ( SubFloat ( sine , 1 ) ) ;
SubFloat ( result , 2 ) = asin ( SubFloat ( sine , 2 ) ) ;
SubFloat ( result , 3 ) = asin ( SubFloat ( sine , 3 ) ) ;
return result ;
}
FORCEINLINE fltx4 ArcCosSIMD ( const fltx4 & cs )
{
fltx4 result ;
SubFloat ( result , 0 ) = acos ( SubFloat ( cs , 0 ) ) ;
SubFloat ( result , 1 ) = acos ( SubFloat ( cs , 1 ) ) ;
SubFloat ( result , 2 ) = acos ( SubFloat ( cs , 2 ) ) ;
SubFloat ( result , 3 ) = acos ( SubFloat ( cs , 3 ) ) ;
return result ;
}
// tan^-1(a/b) .. ie, pass sin in as a and cos in as b
FORCEINLINE fltx4 ArcTan2SIMD ( const fltx4 & a , const fltx4 & b )
{
fltx4 result ;
SubFloat ( result , 0 ) = atan2 ( SubFloat ( a , 0 ) , SubFloat ( b , 0 ) ) ;
SubFloat ( result , 1 ) = atan2 ( SubFloat ( a , 1 ) , SubFloat ( b , 1 ) ) ;
SubFloat ( result , 2 ) = atan2 ( SubFloat ( a , 2 ) , SubFloat ( b , 2 ) ) ;
SubFloat ( result , 3 ) = atan2 ( SubFloat ( a , 3 ) , SubFloat ( b , 3 ) ) ;
return result ;
}
FORCEINLINE fltx4 NegSIMD ( const fltx4 & a ) // negate: -a
{
return SubSIMD ( LoadZeroSIMD ( ) , a ) ;
}
FORCEINLINE int TestSignSIMD ( const fltx4 & a ) // mask of which floats have the high bit set
{
return _mm_movemask_ps ( a ) ;
}
FORCEINLINE bool IsAnyNegative ( const fltx4 & a ) // (a.x < 0) || (a.y < 0) || (a.z < 0) || (a.w < 0)
{
return ( 0 != TestSignSIMD ( a ) ) ;
}
FORCEINLINE bool IsAnyTrue ( const fltx4 & a )
{
return ( 0 != TestSignSIMD ( a ) ) ;
}
FORCEINLINE fltx4 CmpEqSIMD ( const fltx4 & a , const fltx4 & b ) // (a==b) ? ~0:0
{
return _mm_cmpeq_ps ( a , b ) ;
}
FORCEINLINE fltx4 CmpGtSIMD ( const fltx4 & a , const fltx4 & b ) // (a>b) ? ~0:0
{
return _mm_cmpgt_ps ( a , b ) ;
}
FORCEINLINE fltx4 CmpGeSIMD ( const fltx4 & a , const fltx4 & b ) // (a>=b) ? ~0:0
{
return _mm_cmpge_ps ( a , b ) ;
}
FORCEINLINE fltx4 CmpLtSIMD ( const fltx4 & a , const fltx4 & b ) // (a<b) ? ~0:0
{
return _mm_cmplt_ps ( a , b ) ;
}
FORCEINLINE fltx4 CmpLeSIMD ( const fltx4 & a , const fltx4 & b ) // (a<=b) ? ~0:0
{
return _mm_cmple_ps ( a , b ) ;
}
// for branching when a.xyzw > b.xyzw
FORCEINLINE bool IsAllGreaterThan ( const fltx4 & a , const fltx4 & b )
{
return TestSignSIMD ( CmpLeSIMD ( a , b ) ) == 0 ;
}
// for branching when a.xyzw >= b.xyzw
FORCEINLINE bool IsAllGreaterThanOrEq ( const fltx4 & a , const fltx4 & b )
{
return TestSignSIMD ( CmpLtSIMD ( a , b ) ) == 0 ;
}
// For branching if all a.xyzw == b.xyzw
FORCEINLINE bool IsAllEqual ( const fltx4 & a , const fltx4 & b )
{
return TestSignSIMD ( CmpEqSIMD ( a , b ) ) == 0xf ;
}
FORCEINLINE fltx4 CmpInBoundsSIMD ( const fltx4 & a , const fltx4 & b ) // (a <= b && a >= -b) ? ~0 : 0
{
return AndSIMD ( CmpLeSIMD ( a , b ) , CmpGeSIMD ( a , NegSIMD ( b ) ) ) ;
}
FORCEINLINE fltx4 MinSIMD ( const fltx4 & a , const fltx4 & b ) // min(a,b)
{
return _mm_min_ps ( a , b ) ;
}
FORCEINLINE fltx4 MaxSIMD ( const fltx4 & a , const fltx4 & b ) // max(a,b)
{
return _mm_max_ps ( a , b ) ;
}
// SSE lacks rounding operations.
// Really.
// You can emulate them by setting the rounding mode for the
// whole processor and then converting to int, and then back again.
// But every time you set the rounding mode, you clear out the
// entire pipeline. So, I can't do them per operation. You
// have to do it once, before the loop that would call these.
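// Illustrative sketch (assumes the rounding-mode macros from xmmintrin.h): set the mode once,
// outside the hot loop, rather than per operation:
/* as if:
	unsigned int nOldMode = _MM_GET_ROUNDING_MODE ( ) ;
	_MM_SET_ROUNDING_MODE ( _MM_ROUND_DOWN ) ;  // conversions now round toward -infinity
	// ... loop doing float->int conversions ...
	_MM_SET_ROUNDING_MODE ( nOldMode ) ;        // restore the previous mode
*/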
// Round towards positive infinity
FORCEINLINE fltx4 CeilSIMD ( const fltx4 & a )
{
fltx4 retVal ;
SubFloat ( retVal , 0 ) = ceil ( SubFloat ( a , 0 ) ) ;
SubFloat ( retVal , 1 ) = ceil ( SubFloat ( a , 1 ) ) ;
SubFloat ( retVal , 2 ) = ceil ( SubFloat ( a , 2 ) ) ;
SubFloat ( retVal , 3 ) = ceil ( SubFloat ( a , 3 ) ) ;
return retVal ;
}
fltx4 AbsSIMD ( const fltx4 & x ) ;	// same as fabs() below; named with the SIMD suffix to be consistent with the rest of the API
fltx4 fabs ( const fltx4 & x ) ;
// Round towards negative infinity
// This is the implementation that was here before; it assumes
// you are in round-to-floor mode, which I guess is usually the
// case for us vis-a-vis SSE. It's totally unnecessary on
// VMX, which has a native floor op.
FORCEINLINE fltx4 FloorSIMD ( const fltx4 & val )
{
fltx4 fl4Abs = fabs ( val ) ;
fltx4 ival = SubSIMD ( AddSIMD ( fl4Abs , Four_2ToThe23s ) , Four_2ToThe23s ) ;
ival = MaskedAssign ( CmpGtSIMD ( ival , fl4Abs ) , SubSIMD ( ival , Four_Ones ) , ival ) ;
return XorSIMD ( ival , XorSIMD ( val , fl4Abs ) ) ; // restore sign bits
}
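// Worked example of the trick above (illustrative, assumes round-to-nearest): for val = 3.7,
// fabs(val) = 3.7 and 3.7 + 2^23 rounds to 8388612.0 because no fraction bits remain in the
// mantissa at that magnitude; subtracting 2^23 leaves 4.0. Since 4.0 > 3.7 the CmpGt mask fires,
// one is subtracted, and the sign-restoring xor is a no-op, giving floor(3.7) = 3.0.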
FORCEINLINE bool IsAnyZeros ( const fltx4 & a ) // any floats are zero?
{
return TestSignSIMD ( CmpEqSIMD ( a , Four_Zeros ) ) != 0 ;
}
inline bool IsAllZeros ( const fltx4 & var )
{
return TestSignSIMD ( CmpEqSIMD ( var , Four_Zeros ) ) == 0xF ;
}
FORCEINLINE fltx4 SqrtEstSIMD ( const fltx4 & a ) // sqrt(a), more or less
{
return _mm_sqrt_ps ( a ) ;
}
FORCEINLINE fltx4 SqrtSIMD ( const fltx4 & a ) // sqrt(a)
{
return _mm_sqrt_ps ( a ) ;
}
FORCEINLINE fltx4 ReciprocalSqrtEstSIMD ( const fltx4 & a ) // 1/sqrt(a), more or less
{
return _mm_rsqrt_ps ( a ) ;
}
FORCEINLINE fltx4 ReciprocalSqrtEstSaturateSIMD ( const fltx4 & a )
{
fltx4 zero_mask = CmpEqSIMD ( a , Four_Zeros ) ;
fltx4 ret = OrSIMD ( a , AndSIMD ( Four_Epsilons , zero_mask ) ) ;
ret = ReciprocalSqrtEstSIMD ( ret ) ;
return ret ;
}
/// uses newton iteration for higher precision results than ReciprocalSqrtEstSIMD
FORCEINLINE fltx4 ReciprocalSqrtSIMD ( const fltx4 & a ) // 1/sqrt(a)
{
fltx4 guess = ReciprocalSqrtEstSIMD ( a ) ;
// newton iteration for 1/sqrt(a) : y(n+1) = 1/2 (y(n)*(3-a*y(n)^2));
guess = MulSIMD ( guess , SubSIMD ( Four_Threes , MulSIMD ( a , MulSIMD ( guess , guess ) ) ) ) ;
guess = MulSIMD ( Four_PointFives , guess ) ;
return guess ;
}
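// Worked example of the iteration (illustrative; the hardware estimate varies by CPU): for a = 4,
// an estimate of 0.4995 refines to 0.5 * 0.4995 * ( 3 - 4 * 0.4995^2 ) = 0.49999925, ie the
// error drops from roughly 5e-4 to roughly 7.5e-7 in a single step.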
FORCEINLINE fltx4 ReciprocalEstSIMD ( const fltx4 & a ) // 1/a, more or less
{
return _mm_rcp_ps ( a ) ;
}
/// 1/x for all 4 values, more or less
/// 1/0 will result in a big but NOT infinite result
FORCEINLINE fltx4 ReciprocalEstSaturateSIMD ( const fltx4 & a )
{
fltx4 zero_mask = CmpEqSIMD ( a , Four_Zeros ) ;
fltx4 ret = OrSIMD ( a , AndSIMD ( Four_Epsilons , zero_mask ) ) ;
ret = ReciprocalEstSIMD ( ret ) ;
return ret ;
}
/// 1/x for all 4 values. uses reciprocal approximation instruction plus newton iteration.
/// No error checking!
FORCEINLINE fltx4 ReciprocalSIMD ( const fltx4 & a ) // 1/a
{
fltx4 ret = ReciprocalEstSIMD ( a ) ;
// newton iteration is: Y(n+1) = 2*Y(n)-a*Y(n)^2
ret = SubSIMD ( AddSIMD ( ret , ret ) , MulSIMD ( a , MulSIMD ( ret , ret ) ) ) ;
return ret ;
}
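// Worked example of the iteration (illustrative; the hardware estimate varies by CPU): for a = 4,
// an estimate of 0.2498 refines to 2 * 0.2498 - 4 * 0.2498^2 = 0.24999984, roughly squaring the
// relative error of the estimate.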
/// 1/x for all 4 values.
/// 1/0 will result in a big but NOT infinite result
FORCEINLINE fltx4 ReciprocalSaturateSIMD ( const fltx4 & a )
{
fltx4 zero_mask = CmpEqSIMD ( a , Four_Zeros ) ;
fltx4 ret = OrSIMD ( a , AndSIMD ( Four_Epsilons , zero_mask ) ) ;
ret = ReciprocalSIMD ( ret ) ;
return ret ;
}
// CHRISG: is it worth doing integer bitfiddling for this?
// 2^x for all values (the antilog)
FORCEINLINE fltx4 ExpSIMD ( const fltx4 & toPower )
{
fltx4 retval ;
SubFloat ( retval , 0 ) = powf ( 2 , SubFloat ( toPower , 0 ) ) ;
SubFloat ( retval , 1 ) = powf ( 2 , SubFloat ( toPower , 1 ) ) ;
SubFloat ( retval , 2 ) = powf ( 2 , SubFloat ( toPower , 2 ) ) ;
SubFloat ( retval , 3 ) = powf ( 2 , SubFloat ( toPower , 3 ) ) ;
return retval ;
}
// Clamps the components of a vector to a specified minimum and maximum range.
FORCEINLINE fltx4 ClampVectorSIMD ( FLTX4 in , FLTX4 min , FLTX4 max )
{
return MaxSIMD ( min , MinSIMD ( max , in ) ) ;
}
FORCEINLINE void TransposeSIMD ( fltx4 & x , fltx4 & y , fltx4 & z , fltx4 & w )
{
_MM_TRANSPOSE4_PS ( x , y , z , w ) ;
}
FORCEINLINE fltx4 FindLowestSIMD3 ( const fltx4 & a )
{
// a is [x,y,z,G] (where G is garbage)
// rotate left by one
fltx4 compareOne = RotateLeft ( a ) ;
// compareOne is [y,z,G,x]
fltx4 retval = MinSIMD ( a , compareOne ) ;
// retVal is [min(x,y), ... ]
compareOne = RotateLeft2 ( a ) ;
// compareOne is [z, G, x, y]
retval = MinSIMD ( retval , compareOne ) ;
// retVal = [ min(min(x,y),z)..]
// splat the x component out to the whole vector and return
return SplatXSIMD ( retval ) ;
}
FORCEINLINE fltx4 FindHighestSIMD3 ( const fltx4 & a )
{
// a is [x,y,z,G] (where G is garbage)
// rotate left by one
fltx4 compareOne = RotateLeft ( a ) ;
// compareOne is [y,z,G,x]
fltx4 retval = MaxSIMD ( a , compareOne ) ;
// retVal is [max(x,y), ... ]
compareOne = RotateLeft2 ( a ) ;
// compareOne is [z, G, x, y]
retval = MaxSIMD ( retval , compareOne ) ;
// retVal = [ max(max(x,y),z)..]
// splat the x component out to the whole vector and return
return SplatXSIMD ( retval ) ;
}
inline bool IsVector3LessThan ( const fltx4 & v1 , const fltx4 & v2 )
{
bi32x4 isOut = CmpLtSIMD ( v1 , v2 ) ;
return IsAnyNegative ( isOut ) ;
}
inline bool IsVector4LessThan ( const fltx4 & v1 , const fltx4 & v2 )
{
bi32x4 isOut = CmpLtSIMD ( v1 , v2 ) ;
return IsAnyNegative ( isOut ) ;
}
// ------------------------------------
// INTEGER SIMD OPERATIONS.
// ------------------------------------
#if 0 /* pc does not have these ops */
// splat all components of a vector to a signed immediate int number.
FORCEINLINE fltx4 IntSetImmediateSIMD ( int to )
{
//CHRISG: SSE2 has this, but not SSE1. What to do?
fltx4 retval ;
SubInt ( retval , 0 ) = to ;
SubInt ( retval , 1 ) = to ;
SubInt ( retval , 2 ) = to ;
SubInt ( retval , 3 ) = to ;
return retval ;
}
# endif
// Load 4 aligned words into a SIMD register
FORCEINLINE i32x4 LoadAlignedIntSIMD ( const void * RESTRICT pSIMD )
{
return _mm_load_ps ( reinterpret_cast < const float * > ( pSIMD ) ) ;
}
// Load 4 unaligned words into a SIMD register
FORCEINLINE i32x4 LoadUnalignedIntSIMD ( const void * RESTRICT pSIMD )
{
return _mm_loadu_ps ( reinterpret_cast < const float * > ( pSIMD ) ) ;
}
// save into four words, 16-byte aligned
FORCEINLINE void StoreAlignedIntSIMD ( int32 * RESTRICT pSIMD , const fltx4 & a )
{
_mm_store_ps ( reinterpret_cast < float * > ( pSIMD ) , a ) ;
}
FORCEINLINE void StoreAlignedIntSIMD ( intx4 & pSIMD , const fltx4 & a )
{
_mm_store_ps ( reinterpret_cast < float * > ( pSIMD . Base ( ) ) , a ) ;
}
FORCEINLINE void StoreUnalignedIntSIMD ( int32 * RESTRICT pSIMD , const fltx4 & a )
{
_mm_storeu_ps ( reinterpret_cast < float * > ( pSIMD ) , a ) ;
}
// a={ a.x, a.z, b.x, b.z }
// combine two fltx4s by throwing away every other field.
FORCEINLINE fltx4 CompressSIMD ( fltx4 const & a , fltx4 const & b )
{
return _mm_shuffle_ps ( a , b , MM_SHUFFLE_REV ( 0 , 2 , 0 , 2 ) ) ;
}
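// Illustrative example (assumed values): CompressSIMD( {1,2,3,4}, {5,6,7,8} ) yields { 1, 3, 5, 7 }.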
// Load four consecutive uint16's, and turn them into floating point numbers.
// This function isn't especially fast and could be made faster if anyone is
// using it heavily.
FORCEINLINE fltx4 LoadAndConvertUint16SIMD ( const uint16 * pInts )
{
# ifdef POSIX_MATH
fltx4 retval ;
SubFloat ( retval , 0 ) = pInts [ 0 ] ;
SubFloat ( retval , 1 ) = pInts [ 1 ] ;
SubFloat ( retval , 2 ) = pInts [ 2 ] ;
SubFloat ( retval , 3 ) = pInts [ 3 ] ;
return retval ;
# else
__m128i inA = _mm_loadl_epi64 ( ( __m128i const * ) pInts ) ; // Load the lower 64 bits of the value pointed to by p into the lower 64 bits of the result, zeroing the upper 64 bits of the result.
inA = _mm_unpacklo_epi16 ( inA , _mm_setzero_si128 ( ) ) ; // unpack unsigned 16's to signed 32's
return _mm_cvtepi32_ps ( inA ) ;
# endif
}
// a={ a.x, b.x, c.x, d.x }
// combine 4 fltx4s by throwing away 3/4s of the fields
FORCEINLINE fltx4 Compress4SIMD ( fltx4 const a , fltx4 const & b , fltx4 const & c , fltx4 const & d )
{
fltx4 aacc = _mm_shuffle_ps ( a , c , MM_SHUFFLE_REV ( 0 , 0 , 0 , 0 ) ) ;
fltx4 bbdd = _mm_shuffle_ps ( b , d , MM_SHUFFLE_REV ( 0 , 0 , 0 , 0 ) ) ;
return MaskedAssign ( LoadAlignedSIMD ( g_SIMD_EveryOtherMask ) , bbdd , aacc ) ;
}
// outa={ a.x, a.x, a.y, a.y }, outb={ a.z, a.z, a.w, a.w }
FORCEINLINE void ExpandSIMD ( fltx4 const & a , fltx4 & fl4OutA , fltx4 & fl4OutB )
{
fl4OutA = _mm_shuffle_ps ( a , a , MM_SHUFFLE_REV ( 0 , 0 , 1 , 1 ) ) ;
fl4OutB = _mm_shuffle_ps ( a , a , MM_SHUFFLE_REV ( 2 , 2 , 3 , 3 ) ) ;
}
// construct a fltx4 from four different scalars, which are assumed to be neither aligned nor contiguous
FORCEINLINE fltx4 LoadGatherSIMD ( const float & x , const float & y , const float & z , const float & w )
{
// load the float into the low word of each vector register (this exploits the unaligned load op)
fltx4 vx = _mm_load_ss ( & x ) ;
fltx4 vy = _mm_load_ss ( & y ) ;
fltx4 vz = _mm_load_ss ( & z ) ;
fltx4 vw = _mm_load_ss ( & w ) ;
return Compress4SIMD ( vx , vy , vz , vw ) ;
}
// CHRISG: the conversion functions all seem to operate on m64's only...
// how do we make them work here?
// Take a fltx4 containing fixed-point uints and
// return them as single precision floats. No
// fixed point conversion is done.
FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD ( const u32x4 & vSrcA )
{
fltx4 retval ;
SubFloat ( retval , 0 ) = ( ( float ) SubInt ( vSrcA , 0 ) ) ;
SubFloat ( retval , 1 ) = ( ( float ) SubInt ( vSrcA , 1 ) ) ;
SubFloat ( retval , 2 ) = ( ( float ) SubInt ( vSrcA , 2 ) ) ;
SubFloat ( retval , 3 ) = ( ( float ) SubInt ( vSrcA , 3 ) ) ;
return retval ;
}
// Take a fltx4 containing fixed-point sints and
// return them as single precision floats. No
// fixed point conversion is done.
FORCEINLINE fltx4 SignedIntConvertToFltSIMD ( const i32x4 & vSrcA )
{
return _mm_cvtepi32_ps ( ( const __m128i & ) vSrcA ) ;
}
FORCEINLINE fltx4 SignedIntConvertToFltSIMD ( const shortx8 & vSrcA )
{
return _mm_cvtepi32_ps ( vSrcA ) ;
}
#if 0
// Take a fltx4 containing fixed-point sints and
// return them as single precision floats. No
// fixed point conversion is done.
FORCEINLINE fltx4 SignedIntConvertToFltSIMD ( const i32x4 & vSrcA )
{
fltx4 retval ;
SubFloat ( retval , 0 ) = ( ( float ) ( reinterpret_cast < const int32 * > ( & vSrcA ) [ 0 ] ) ) ;
SubFloat ( retval , 1 ) = ( ( float ) ( reinterpret_cast < const int32 * > ( & vSrcA ) [ 1 ] ) ) ;
SubFloat ( retval , 2 ) = ( ( float ) ( reinterpret_cast < const int32 * > ( & vSrcA ) [ 2 ] ) ) ;
SubFloat ( retval , 3 ) = ( ( float ) ( reinterpret_cast < const int32 * > ( & vSrcA ) [ 3 ] ) ) ;
return retval ;
}
# endif
/*
works on fltx4's as if they are four uints.
the first parameter contains the words to be shifted,
the second contains the amount to shift by AS INTS

for i = 0 to 3
shift = bits (i*32) .. (i*32)+4 of vSrcB
bits (i*32) .. (i*32)+31 of vReturned = ( bits (i*32) .. (i*32)+31 of vSrcA ) << shift
*/
FORCEINLINE i32x4 IntShiftLeftWordSIMD ( const i32x4 & vSrcA , const i32x4 & vSrcB )
{
i32x4 retval ;
SubInt ( retval , 0 ) = SubInt ( vSrcA , 0 ) << SubInt ( vSrcB , 0 ) ;
SubInt ( retval , 1 ) = SubInt ( vSrcA , 1 ) << SubInt ( vSrcB , 1 ) ;
SubInt ( retval , 2 ) = SubInt ( vSrcA , 2 ) << SubInt ( vSrcB , 2 ) ;
SubInt ( retval , 3 ) = SubInt ( vSrcA , 3 ) << SubInt ( vSrcB , 3 ) ;
return retval ;
}
// Fixed-point conversion and save as SIGNED INTS.
// pDest->x = Int (vSrc.x)
// note: some architectures have means of doing
// fixed point conversion when the fix depth is
// specified as an immediate.. but there is no way
// to guarantee an immediate as a parameter to a function
// like this.
FORCEINLINE void ConvertStoreAsIntsSIMD ( intx4 * RESTRICT pDest , const fltx4 & vSrc )
{
# if defined(_MSC_VER) && _MSC_VER >= 1900 && defined(COMPILER_MSVC64)
( * pDest ) [ 0 ] = ( int ) SubFloat ( vSrc , 0 ) ;
( * pDest ) [ 1 ] = ( int ) SubFloat ( vSrc , 1 ) ;
( * pDest ) [ 2 ] = ( int ) SubFloat ( vSrc , 2 ) ;
( * pDest ) [ 3 ] = ( int ) SubFloat ( vSrc , 3 ) ;
# else
__m64 bottom = _mm_cvttps_pi32 ( vSrc ) ;
__m64 top = _mm_cvttps_pi32 ( _mm_movehl_ps ( vSrc , vSrc ) ) ;
* reinterpret_cast < __m64 * > ( & ( * pDest ) [ 0 ] ) = bottom ;
* reinterpret_cast < __m64 * > ( & ( * pDest ) [ 2 ] ) = top ;
_mm_empty ( ) ;
# endif
}
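// Illustrative usage sketch (not part of the API); the conversion truncates toward zero on both paths:
/* as if:
	intx4 dst ;
	ConvertStoreAsIntsSIMD ( & dst , ReplicateX4 ( 3.7f ) ) ;  // dst is now { 3, 3, 3, 3 }
*/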
# endif
// a={a.y, a.z, a.w, b.x } b={b.y, b.z, b.w, b.x }
FORCEINLINE void RotateLeftDoubleSIMD ( fltx4 & a , fltx4 & b )
{
a = SetWSIMD ( RotateLeft ( a ) , SplatXSIMD ( b ) ) ;
b = RotateLeft ( b ) ;
}
// Some convenience operator overloads, which are just aliasing the functions above.
// Unnecessary on 360, as you already have them from xboxmath.h (same for PS3 PPU and SPU)
# if !defined(PLATFORM_PPC) && !defined( POSIX_MATH ) && !defined(SPU)
# if 1 // TODO: verify generation of non-bad code.
// Componentwise add
FORCEINLINE fltx4 operator + ( FLTX4 a , FLTX4 b )
{
return AddSIMD ( a , b ) ;
}
// Componentwise subtract
FORCEINLINE fltx4 operator - ( FLTX4 a , FLTX4 b )
{
return SubSIMD ( a , b ) ;
}
// Componentwise multiply
FORCEINLINE fltx4 operator * ( FLTX4 a , FLTX4 b )
{
return MulSIMD ( a , b ) ;
}
// No divide. You need to think carefully about whether you want a reciprocal
// or a reciprocal estimate.
// bitwise and
FORCEINLINE fltx4 operator & ( FLTX4 a , FLTX4 b )
{
return AndSIMD ( a , b ) ;
}
// bitwise or
FORCEINLINE fltx4 operator | ( FLTX4 a , FLTX4 b )
{
return OrSIMD ( a , b ) ;
}
// bitwise xor
FORCEINLINE fltx4 operator ^ ( FLTX4 a , FLTX4 b )
{
return XorSIMD ( a , b ) ;
}
// unary negate
FORCEINLINE fltx4 operator - ( FLTX4 a )
{
return NegSIMD ( a ) ;
}
# endif // 1
# endif
# if defined(_X360) || defined(_PS3)
FORCEINLINE fltx4 VectorMergeHighSIMD ( fltx4 fl4SrcA , fltx4 fl4SrcB )
{
# if defined( _X360 )
return __vmrghw ( fl4SrcA , fl4SrcB ) ;
# else
return vec_mergeh ( fl4SrcA , fl4SrcB ) ;
# endif
}
FORCEINLINE fltx4 VectorMergeLowSIMD ( fltx4 fl4SrcA , fltx4 fl4SrcB )
{
# if defined( _X360 )
return __vmrglw ( fl4SrcA , fl4SrcB ) ;
# else
return vec_mergel ( fl4SrcA , fl4SrcB ) ;
# endif
}
# endif
# ifndef SPU
// fourplanes_t, Frustum_t are not supported on SPU
// It would make sense to support FourVectors on SPU at some point.
struct ALIGN16 fourplanes_t
{
fltx4 nX ;
fltx4 nY ;
fltx4 nZ ;
fltx4 dist ;
bi32x4 xSign ;
bi32x4 ySign ;
bi32x4 zSign ;
fltx4 nXAbs ;
fltx4 nYAbs ;
fltx4 nZAbs ;
void ComputeSignbits ( ) ;
// fast SIMD loads
void Set4Planes ( const VPlane * pPlanes ) ;
void Set2Planes ( const VPlane * pPlanes ) ;
void Get4Planes ( VPlane * pPlanesOut ) const ;
void Get2Planes ( VPlane * pPlanesOut ) const ;
// not-SIMD, much slower
void GetPlane ( int index , Vector3D * pNormal , float * pDist ) const ;
void SetPlane ( int index , const Vector3D & vecNormal , float planeDist ) ;
} ;
class ALIGN16 Frustum_t
{
public :
Frustum_t ( ) ;
void SetPlane ( int i , const Vector3D & vecNormal , float dist ) ;
void GetPlane ( int i , Vector3D * pNormalOut , float * pDistOut ) const ;
void SetPlanes ( const VPlane * pPlanes ) ;
void GetPlanes ( VPlane * pPlanesOut ) const ;
// returns false if the box is within the frustum, true if it is outside
bool CullBox ( const Vector3D & mins , const Vector3D & maxs ) const ;
bool CullBoxCenterExtents ( const Vector3D & center , const Vector3D & extents ) const ;
bool CullBox ( const fltx4 & fl4Mins , const fltx4 & fl4Maxs ) const ;
bool CullBoxCenterExtents ( const fltx4 & fl4Center , const fltx4 & fl4Extents ) const ;
// Return true if frustum contains this bounding volume, false if any corner is outside
bool Contains ( const Vector3D & mins , const Vector3D & maxs ) const ;
// Return true if this frustum intersects the frustum, false if it is outside
bool Intersects ( Frustum_t & otherFrustum ) const ;
// Return true if this bounding volume intersects the frustum, false if it is outside
bool Intersects ( const Vector3D & mins , const Vector3D & maxs ) const ;
bool IntersectsCenterExtents ( const Vector3D & center , const Vector3D & extents ) const ;
bool Intersects ( const fltx4 & fl4Mins , const fltx4 & fl4Maxs ) const ;
bool IntersectsCenterExtents ( const fltx4 & fl4Center , const fltx4 & fl4Extents ) const ;
void CreatePerspectiveFrustum ( const Vector3D & origin , const Vector3D & forward ,
const Vector3D & right , const Vector3D & up , float flZNear , float flZFar ,
float flFovX , float flAspect ) ;
void CreatePerspectiveFrustumFLU ( const Vector3D & vOrigin , const Vector3D & vForward ,
const Vector3D & vLeft , const Vector3D & vUp , float flZNear , float flZFar ,
float flFovX , float flAspect ) ;
// Version that accepts angles instead of vectors
void CreatePerspectiveFrustum ( const Vector3D & origin , const QAngle & angles , float flZNear ,
float flZFar , float flFovX , float flAspectRatio ) ;
// Generate a frustum based on orthographic parameters
void CreateOrthoFrustum ( const Vector3D & origin , const Vector3D & forward , const Vector3D & right , const Vector3D & up ,
float flLeft , float flRight , float flBottom , float flTop , float flZNear , float flZFar ) ;
void CreateOrthoFrustumFLU ( const Vector3D & vOrigin , const Vector3D & vForward , const Vector3D & vLeft , const Vector3D & vUp ,
float flLeft , float flRight , float flBottom , float flTop , float flZNear , float flZFar ) ;
// The points returned correspond to the corners of the frustum faces
// Points 0 to 3 correspond to the near face
// Points 4 to 7 correspond to the far face
// Returns points in a face in this order:
// 2--3
// | |
// 0--1
// Returns false if a corner couldn't be generated for some reason.
bool GetCorners ( Vector3D * pPoints ) const ;
fourplanes_t planes [ 2 ] ;
} ;
# endif
class FourQuaternions ;
/// class FourVectors stores 4 independent vectors for use in SIMD processing. These vectors are
/// stored in the format x x x x y y y y z z z z so that they can be efficiently SIMD-accelerated.
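/// Illustrative layout sketch: given four Vectors A, B, C, D, a FourVectors holds
///   x = { A.x, B.x, C.x, D.x }
///   y = { A.y, B.y, C.y, D.y }
///   z = { A.z, B.z, C.z, D.z }
/// so a single SIMD instruction applies the same scalar operation to all four vectors at once.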
class ALIGN16 FourVectors
{
public :
fltx4 x , y , z ;
FourVectors ( void )
{
}
FourVectors ( FourVectors const & src )
{
x = src . x ;
y = src . y ;
z = src . z ;
}
explicit FORCEINLINE FourVectors ( float a )
{
fltx4 aReplicated = ReplicateX4 ( a ) ;
x = y = z = aReplicated ;
}
FORCEINLINE void Init ( void )
{
x = Four_Zeros ;
y = Four_Zeros ;
z = Four_Zeros ;
}
FORCEINLINE void Init ( float flX , float flY , float flZ )
{
x = ReplicateX4 ( flX ) ;
y = ReplicateX4 ( flY ) ;
z = ReplicateX4 ( flZ ) ;
}
FORCEINLINE FourVectors ( float flX , float flY , float flZ )
{
Init ( flX , flY , flZ ) ;
}
FORCEINLINE void Init ( fltx4 const & fl4X , fltx4 const & fl4Y , fltx4 const & fl4Z )
{
x = fl4X ;
y = fl4Y ;
z = fl4Z ;
}
FORCEINLINE FourVectors ( fltx4 const & fl4X , fltx4 const & fl4Y , fltx4 const & fl4Z )
{
Init ( fl4X , fl4Y , fl4Z ) ;
}
/// construct a FourVectors from 4 separate Vectors
FORCEINLINE FourVectors ( Vector3D const & a , Vector3D const & b , Vector3D const & c , Vector3D const & d )
{
LoadAndSwizzle ( a , b , c , d ) ;
}
/// construct a FourVectors from 4 separate Vectors
FORCEINLINE FourVectors ( VectorAligned const & a , VectorAligned const & b , VectorAligned const & c , VectorAligned const & d )
{
LoadAndSwizzleAligned ( a , b , c , d ) ;
}
// construct from twelve floats; really only useful for static const constructors.
// input arrays must be aligned, and in the fourvectors' native format
// (eg in xxxx,yyyy,zzzz form)
// each pointer should be to an aligned array of four floats
FORCEINLINE FourVectors ( const float * xs , const float * ys , const float * zs ) :
x ( LoadAlignedSIMD ( xs ) ) , y ( LoadAlignedSIMD ( ys ) ) , z ( LoadAlignedSIMD ( zs ) )
{ } ;
FORCEINLINE void DuplicateVector ( Vector3D const & v ) //< set all 4 vectors to the same vector value
{
x = ReplicateX4 ( v . x ) ;
y = ReplicateX4 ( v . y ) ;
z = ReplicateX4 ( v . z ) ;
}
FORCEINLINE fltx4 const & operator [ ] ( int idx ) const
{
return * ( ( & x ) + idx ) ;
}
FORCEINLINE fltx4 & operator [ ] ( int idx )
{
return * ( ( & x ) + idx ) ;
}
FORCEINLINE void operator + = ( FourVectors const & b ) //< add 4 vectors to another 4 vectors
{
x = AddSIMD ( x , b . x ) ;
y = AddSIMD ( y , b . y ) ;
z = AddSIMD ( z , b . z ) ;
}
FORCEINLINE void operator - = ( FourVectors const & b ) //< subtract 4 vectors from another 4
{
x = SubSIMD ( x , b . x ) ;
y = SubSIMD ( y , b . y ) ;
z = SubSIMD ( z , b . z ) ;
}
FORCEINLINE void operator * = ( FourVectors const & b ) //< scale all four vectors per component scale
{
x = MulSIMD ( x , b . x ) ;
y = MulSIMD ( y , b . y ) ;
z = MulSIMD ( z , b . z ) ;
}
FORCEINLINE void operator * = ( const fltx4 & scale ) //< scale
{
x = MulSIMD ( x , scale ) ;
y = MulSIMD ( y , scale ) ;
z = MulSIMD ( z , scale ) ;
}
FORCEINLINE void operator * = ( float scale ) //< uniformly scale all 4 vectors
{
fltx4 scalepacked = ReplicateX4 ( scale ) ;
* this * = scalepacked ;
}
FORCEINLINE fltx4 operator * ( FourVectors const & b ) const //< 4 dot products
{
fltx4 dot = MulSIMD ( x , b . x ) ;
dot = MaddSIMD ( y , b . y , dot ) ;
dot = MaddSIMD ( z , b . z , dot ) ;
return dot ;
}
FORCEINLINE fltx4 operator * ( Vector3D const & b ) const //< dot product all 4 vectors with 1 vector
{
fltx4 dot = MulSIMD ( x , ReplicateX4 ( b . x ) ) ;
dot = MaddSIMD ( y , ReplicateX4 ( b . y ) , dot ) ;
dot = MaddSIMD ( z , ReplicateX4 ( b . z ) , dot ) ;
return dot ;
}
FORCEINLINE FourVectors operator * ( float b ) const //< scale
{
fltx4 scalepacked = ReplicateX4 ( b ) ;
FourVectors res ;
res . x = MulSIMD ( x , scalepacked ) ;
res . y = MulSIMD ( y , scalepacked ) ;
res . z = MulSIMD ( z , scalepacked ) ;
return res ;
}
FORCEINLINE FourVectors operator * ( FLTX4 fl4Scale ) const //< scale
{
FourVectors res ;
res . x = MulSIMD ( x , fl4Scale ) ;
res . y = MulSIMD ( y , fl4Scale ) ;
res . z = MulSIMD ( z , fl4Scale ) ;
return res ;
}
FORCEINLINE void VProduct ( FourVectors const & b ) //< component by component mul
{
x = MulSIMD ( x , b . x ) ;
y = MulSIMD ( y , b . y ) ;
z = MulSIMD ( z , b . z ) ;
}
FORCEINLINE void MakeReciprocal ( void ) //< (x,y,z)=(1/x,1/y,1/z)
{
x = ReciprocalSIMD ( x ) ;
y = ReciprocalSIMD ( y ) ;
z = ReciprocalSIMD ( z ) ;
}
FORCEINLINE void MakeReciprocalSaturate ( void ) //< (x,y,z)=(1/x,1/y,1/z), 1/0=1.0e23
{
x = ReciprocalSaturateSIMD ( x ) ;
y = ReciprocalSaturateSIMD ( y ) ;
z = ReciprocalSaturateSIMD ( z ) ;
}
// Assume the given matrix is a rotation, and rotate these vectors by it.
// If you have a long list of FourVectors structures that you all want
// to rotate by the same matrix, use FourVectors::RotateManyBy() instead.
inline void RotateBy ( const matrix3x4_t & matrix ) ;
/***** removed because one of the SWIG permutations doesn't include ssequaternion.h, causing a missing symbol on this function:
// rotate these vectors ( in place ) by the corresponding quaternions:
inline void RotateBy ( const FourQuaternions & quats ) ;
*****/
/// You can use this to rotate a long array of FourVectors all by the same
/// matrix. The first parameter is the head of the array. The second is the
/// number of vectors to rotate. The third is the matrix.
static void RotateManyBy ( FourVectors * RESTRICT pVectors , unsigned int numVectors , const matrix3x4_t & rotationMatrix ) ;
static void RotateManyBy ( FourVectors * RESTRICT pVectors , unsigned int numVectors , const matrix3x4_t & rotationMatrix , FourVectors * RESTRICT pOut ) ;
/// Assume the vectors are points, and transform them in place by the matrix.
inline void TransformBy ( const matrix3x4_t & matrix ) ;
/// You can use this to Transform a long array of FourVectors all by the same
/// matrix. The first parameter is the head of the array. The second is the
/// number of vectors to rotate. The third is the matrix. The fourth is the
/// output buffer, which must not overlap the pVectors buffer. This is not
/// an in-place transformation.
static void TransformManyBy ( FourVectors * RESTRICT pVectors , unsigned int numVectors , const matrix3x4_t & rotationMatrix , FourVectors * RESTRICT pOut ) ;
/// You can use this to Transform a long array of FourVectors all by the same
/// matrix. The first parameter is the head of the array. The second is the
/// number of vectors to rotate. The third is the matrix. The fourth is the
/// output buffer, which must not overlap the pVectors buffer.
/// This is an in-place transformation.
static void TransformManyBy ( FourVectors * RESTRICT pVectors , unsigned int numVectors , const matrix3x4_t & rotationMatrix ) ;
static void CalcClosestPointOnLineSIMD ( const FourVectors & P , const FourVectors & vLineA , const FourVectors & vLineB , FourVectors & vClosest , fltx4 * outT = 0 ) ;
static fltx4 CalcClosestPointToLineTSIMD ( const FourVectors & P , const FourVectors & vLineA , const FourVectors & vLineB , FourVectors & vDir ) ;
// X(),Y(),Z() - get at the desired component of the i'th (0..3) vector.
FORCEINLINE const float & X ( int idx ) const
{
// NOTE: if the output goes into a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!)
return SubFloat ( ( fltx4 & ) x , idx ) ;
}
FORCEINLINE const float & Y ( int idx ) const
{
return SubFloat ( ( fltx4 & ) y , idx ) ;
}
FORCEINLINE const float & Z ( int idx ) const
{
return SubFloat ( ( fltx4 & ) z , idx ) ;
}
FORCEINLINE float & X ( int idx )
{
return SubFloat ( x , idx ) ;
}
FORCEINLINE float & Y ( int idx )
{
return SubFloat ( y , idx ) ;
}
FORCEINLINE float & Z ( int idx )
{
return SubFloat ( z , idx ) ;
}
FORCEINLINE Vector3D Vec ( int idx ) const //< unpack one of the vectors
{
return Vector3D ( X ( idx ) , Y ( idx ) , Z ( idx ) ) ;
}
FORCEINLINE void operator = ( FourVectors const & src )
{
x = src . x ;
y = src . y ;
z = src . z ;
}
/// LoadAndSwizzle - load 4 Vectors into a FourVectors, performing transpose op
FORCEINLINE void LoadAndSwizzle ( Vector3D const & a , Vector3D const & b , Vector3D const & c , Vector3D const & d )
{
// TransposeSIMD has large sub-expressions that the compiler can't eliminate on x360
// use an unfolded implementation here
# if defined( _X360 ) || defined(_PS3)
fltx4 tx = LoadUnalignedSIMD ( & a . x ) ;
fltx4 ty = LoadUnalignedSIMD ( & b . x ) ;
fltx4 tz = LoadUnalignedSIMD ( & c . x ) ;
fltx4 tw = LoadUnalignedSIMD ( & d . x ) ;
fltx4 r0 = VectorMergeHighSIMD ( tx , tz ) ;
fltx4 r1 = VectorMergeHighSIMD ( ty , tw ) ;
fltx4 r2 = VectorMergeLowSIMD ( tx , tz ) ;
fltx4 r3 = VectorMergeLowSIMD ( ty , tw ) ;
x = VectorMergeHighSIMD ( r0 , r1 ) ;
y = VectorMergeLowSIMD ( r0 , r1 ) ;
z = VectorMergeHighSIMD ( r2 , r3 ) ;
# else
x = LoadUnalignedSIMD ( & ( a . x ) ) ;
y = LoadUnalignedSIMD ( & ( b . x ) ) ;
z = LoadUnalignedSIMD ( & ( c . x ) ) ;
fltx4 w = LoadUnalignedSIMD ( & ( d . x ) ) ;
// now, matrix is:
// x y z ?
// x y z ?
// x y z ?
// x y z ?
TransposeSIMD ( x , y , z , w ) ;
# endif
}
FORCEINLINE void LoadAndSwizzle ( Vector3D const & a )
{
LoadAndSwizzle ( a , a , a , a ) ;
}
// Broadcasts a, b, c, and d into the four vectors
// This is only performant if the floats are ALREADY IN MEMORY
// and not in registers -- eg,
// .Load( &fltArrray[0], &fltArrray[1], &fltArrray[2], &fltArrray[3] ) is okay,
// .Load( fltArrray[0] * 0.5f, fltArrray[1] * 0.5f, fltArrray[2] * 0.5f, fltArrray[3] * 0.5f ) is not.
FORCEINLINE void Load ( const float & a , const float & b , const float & c , const float & d )
{
# if defined( _X360 ) || defined( _PS3 )
fltx4 temp [ 4 ] ;
temp [ 0 ] = LoadUnalignedFloatSIMD ( & a ) ;
temp [ 1 ] = LoadUnalignedFloatSIMD ( & b ) ;
temp [ 2 ] = LoadUnalignedFloatSIMD ( & c ) ;
temp [ 3 ] = LoadUnalignedFloatSIMD ( & d ) ;
y = VectorMergeHighSIMD ( temp [ 0 ] , temp [ 2 ] ) ; // ac__
z = VectorMergeHighSIMD ( temp [ 1 ] , temp [ 3 ] ) ; // bd__
x = VectorMergeHighSIMD ( y , z ) ; // abcd
y = x ;
z = x ;
# else
ALIGN16 float temp [ 4 ] ;
temp [ 0 ] = a ; temp [ 1 ] = b ; temp [ 2 ] = c ; temp [ 3 ] = d ;
fltx4 v = LoadAlignedSIMD ( temp ) ;
x = v ;
y = v ;
z = v ;
# endif
}
// transform four horizontal vectors into the internal vertical ones
FORCEINLINE void LoadAndSwizzle ( FLTX4 a , FLTX4 b , FLTX4 c , FLTX4 d )
{
# if defined( _X360 ) || defined( _PS3 )
fltx4 tx = a ;
fltx4 ty = b ;
fltx4 tz = c ;
fltx4 tw = d ;
fltx4 r0 = VectorMergeHighSIMD ( tx , tz ) ;
fltx4 r1 = VectorMergeHighSIMD ( ty , tw ) ;
fltx4 r2 = VectorMergeLowSIMD ( tx , tz ) ;
fltx4 r3 = VectorMergeLowSIMD ( ty , tw ) ;
x = VectorMergeHighSIMD ( r0 , r1 ) ;
y = VectorMergeLowSIMD ( r0 , r1 ) ;
z = VectorMergeHighSIMD ( r2 , r3 ) ;
# else
x = a ;
y = b ;
z = c ;
fltx4 w = d ;
// now, matrix is:
// x y z ?
// x y z ?
// x y z ?
// x y z ?
TransposeSIMD ( x , y , z , w ) ;
# endif
}
/// LoadAndSwizzleAligned - load 4 Vectors into a FourVectors, performing transpose op.
/// all 4 pointers must be on a 16-byte (128-bit) boundary
FORCEINLINE void LoadAndSwizzleAligned ( const float * RESTRICT a , const float * RESTRICT b , const float * RESTRICT c , const float * RESTRICT d )
{
# if defined( _X360 ) || defined( _PS3 )
fltx4 tx = LoadAlignedSIMD ( a ) ;
fltx4 ty = LoadAlignedSIMD ( b ) ;
fltx4 tz = LoadAlignedSIMD ( c ) ;
fltx4 tw = LoadAlignedSIMD ( d ) ;
fltx4 r0 = VectorMergeHighSIMD ( tx , tz ) ;
fltx4 r1 = VectorMergeHighSIMD ( ty , tw ) ;
fltx4 r2 = VectorMergeLowSIMD ( tx , tz ) ;
fltx4 r3 = VectorMergeLowSIMD ( ty , tw ) ;
x = VectorMergeHighSIMD ( r0 , r1 ) ;
y = VectorMergeLowSIMD ( r0 , r1 ) ;
z = VectorMergeHighSIMD ( r2 , r3 ) ;
# else
x = LoadAlignedSIMD ( a ) ;
y = LoadAlignedSIMD ( b ) ;
z = LoadAlignedSIMD ( c ) ;
fltx4 w = LoadAlignedSIMD ( d ) ;
// now, matrix is:
// x y z ?
// x y z ?
// x y z ?
// x y z ?
TransposeSIMD ( x , y , z , w ) ;
# endif
}
FORCEINLINE void LoadAndSwizzleAligned ( Vector3D const & a , Vector3D const & b , Vector3D const & c , Vector3D const & d )
{
LoadAndSwizzleAligned ( & a . x , & b . x , & c . x , & d . x ) ;
}
/// Unpack a FourVectors back into four horizontal fltx4s.
/// Since the FourVectors doesn't store a w row, you can optionally
/// specify your own; otherwise it will be 0.
/// This function ABSOLUTELY MUST be inlined or the reference parameters will
/// induce a severe load-hit-store.
FORCEINLINE void TransposeOnto ( fltx4 & out0 , fltx4 & out1 , fltx4 & out2 , fltx4 & out3 , FLTX4 w = Four_Zeros ) const
{
// TransposeSIMD has large sub-expressions that the compiler can't eliminate on x360
// use an unfolded implementation here
# if defined( _X360 ) || defined(_PS3)
fltx4 r0 = VectorMergeHighSIMD ( x , z ) ;
fltx4 r1 = VectorMergeHighSIMD ( y , w ) ;
fltx4 r2 = VectorMergeLowSIMD ( x , z ) ;
fltx4 r3 = VectorMergeLowSIMD ( y , w ) ;
out0 = VectorMergeHighSIMD ( r0 , r1 ) ;
out1 = VectorMergeLowSIMD ( r0 , r1 ) ;
out2 = VectorMergeHighSIMD ( r2 , r3 ) ;
out3 = VectorMergeLowSIMD ( r2 , r3 ) ;
# else
out0 = x ;
out1 = y ;
out2 = z ;
out3 = w ;
TransposeSIMD ( out0 , out1 , out2 , out3 ) ;
# endif
}
# if !defined(__SPU__)
/// Store a FourVectors into four NON-CONTIGUOUS Vector3D*'s.
FORCEINLINE void StoreUnalignedVector3SIMD ( Vector3D * RESTRICT out0 , Vector3D * RESTRICT out1 , Vector3D * RESTRICT out2 , Vector3D * RESTRICT out3 ) const ;
# endif
/// Store a FourVectors into four NON-CONTIGUOUS VectorAligneds.
FORCEINLINE void StoreAlignedVectorSIMD ( VectorAligned * RESTRICT out0 , VectorAligned * RESTRICT out1 , VectorAligned * RESTRICT out2 , VectorAligned * RESTRICT out3 ) const ;
# if !defined(__SPU__)
/// Store a FourVectors into four CONSECUTIVE Vectors in memory,
/// where the first vector IS NOT aligned on a 16-byte boundary.
FORCEINLINE void StoreUnalignedContigVector3SIMD ( Vector3D * RESTRICT pDestination )
{
fltx4 a , b , c , d ;
TransposeOnto ( a , b , c , d ) ;
StoreFourUnalignedVector3SIMD ( a , b , c , d , pDestination ) ;
}
# endif
/// Store a FourVectors into four CONSECUTIVE Vectors in memory,
/// where the first vector IS aligned on a 16-byte boundary.
/// (since four Vector3s = 48 bytes, groups of four can be said
/// to be 16-byte aligned though obviously the 2nd, 3d, and 4th
/// vectors in the group individually are not)
# if !defined(__SPU__)
FORCEINLINE void StoreAlignedContigVector3SIMD ( Vector3D * RESTRICT pDestination )
{
fltx4 a , b , c , d ;
TransposeOnto ( a , b , c , d ) ;
StoreFourAlignedVector3SIMD ( a , b , c , d , pDestination ) ;
}
/// Store a FourVectors into four CONSECUTIVE VectorAligneds in memory
FORCEINLINE void StoreAlignedContigVectorASIMD ( VectorAligned * RESTRICT pDestination )
{
StoreAlignedVectorSIMD ( pDestination , pDestination + 1 , pDestination + 2 , pDestination + 3 ) ;
}
# endif
/// return the squared length of all 4 vectors, the same name as used on Vector
FORCEINLINE fltx4 LengthSqr ( void ) const
{
const FourVectors & a = * this ;
return a * a ;
}
/// return the squared length of all 4 vectors
FORCEINLINE fltx4 length2 ( void ) const
{
return ( * this ) * ( * this ) ;
}
/// return the approximate length of all 4 vectors. uses the sqrt approximation instruction
FORCEINLINE fltx4 length ( void ) const
{
return SqrtEstSIMD ( length2 ( ) ) ;
}
/// full precision square root. The upper/lower-case naming is an artifact -- the lower-case length() should be renamed to reflect its lower accuracy; the mixed-case one was added for compatibility with Vector.
FORCEINLINE fltx4 Length ( void ) const
{
return SqrtSIMD ( length2 ( ) ) ;
}
/// normalize all 4 vectors in place. not mega-accurate (uses reciprocal approximation instruction)
FORCEINLINE void VectorNormalizeFast ( void )
{
fltx4 mag_sq = ( * this ) * ( * this ) ; // length^2
( * this ) * = ReciprocalSqrtEstSIMD ( mag_sq ) ; // *(1.0/sqrt(length^2))
}
/// normalize all 4 vectors in place.
FORCEINLINE void VectorNormalize ( void )
{
fltx4 mag_sq = ( * this ) * ( * this ) ; // length^2
( * this ) * = ReciprocalSqrtSIMD ( mag_sq ) ; // *(1.0/sqrt(length^2))
}
FORCEINLINE fltx4 DistToSqr ( FourVectors const & pnt )
{
fltx4 fl4dX = SubSIMD ( pnt . x , x ) ;
fltx4 fl4dY = SubSIMD ( pnt . y , y ) ;
fltx4 fl4dZ = SubSIMD ( pnt . z , z ) ;
return AddSIMD ( MulSIMD ( fl4dX , fl4dX ) , AddSIMD ( MulSIMD ( fl4dY , fl4dY ) , MulSIMD ( fl4dZ , fl4dZ ) ) ) ;
}
FORCEINLINE fltx4 TValueOfClosestPointOnLine ( FourVectors const & p0 , FourVectors const & p1 ) const
{
FourVectors lineDelta = p1 ;
lineDelta - = p0 ;
fltx4 OOlineDirDotlineDir = ReciprocalSIMD ( p1 * p1 ) ;
FourVectors v4OurPnt = * this ;
v4OurPnt - = p0 ;
return MulSIMD ( OOlineDirDotlineDir , v4OurPnt * lineDelta ) ;
}
FORCEINLINE fltx4 DistSqrToLineSegment ( FourVectors const & p0 , FourVectors const & p1 ) const
{
FourVectors lineDelta = p1 ;
FourVectors v4OurPnt = * this ;
v4OurPnt - = p0 ;
lineDelta - = p0 ;
fltx4 OOlineDirDotlineDir = ReciprocalSIMD ( lineDelta * lineDelta ) ;
fltx4 fl4T = MulSIMD ( OOlineDirDotlineDir , v4OurPnt * lineDelta ) ;
fl4T = MinSIMD ( fl4T , Four_Ones ) ;
fl4T = MaxSIMD ( fl4T , Four_Zeros ) ;
lineDelta * = fl4T ;
return v4OurPnt . DistToSqr ( lineDelta ) ;
}
FORCEINLINE FourVectors Normalized ( ) const
{
fltx4 fl4LengthInv = ReciprocalSqrtSIMD ( LengthSqr ( ) ) ;
FourVectors out ;
out . x = x * fl4LengthInv ;
out . y = y * fl4LengthInv ;
out . z = z * fl4LengthInv ;
return out ;
}
FORCEINLINE FourVectors NormalizedSafeX ( ) const
{
fltx4 f4LenSqr = LengthSqr ( ) ;
fltx4 isBigEnough = CmpGeSIMD ( f4LenSqr , Four_Epsilons ) ;
fltx4 fl4LengthInv = ReciprocalSqrtSIMD ( f4LenSqr ) ;
FourVectors out ;
out . x = MaskedAssign ( isBigEnough , x * fl4LengthInv , Four_Ones ) ;
out . y = AndSIMD ( y * fl4LengthInv , isBigEnough ) ;
out . z = AndSIMD ( z * fl4LengthInv , isBigEnough ) ;
return out ;
}
FORCEINLINE FourVectors NormalizedSafeY ( ) const
{
fltx4 f4LenSqr = LengthSqr ( ) ;
fltx4 isBigEnough = CmpGeSIMD ( f4LenSqr , Four_Epsilons ) ;
fltx4 fl4LengthInv = ReciprocalSqrtSIMD ( f4LenSqr ) ;
FourVectors out ;
out . x = AndSIMD ( x * fl4LengthInv , isBigEnough ) ;
out . y = MaskedAssign ( isBigEnough , y * fl4LengthInv , Four_Ones ) ;
out . z = AndSIMD ( z * fl4LengthInv , isBigEnough ) ;
return out ;
}
FORCEINLINE FourVectors NormalizedSafeZ ( ) const
{
fltx4 f4LenSqr = LengthSqr ( ) ;
fltx4 isBigEnough = CmpGeSIMD ( f4LenSqr , Four_Epsilons ) ;
fltx4 fl4LengthInv = ReciprocalSqrtSIMD ( f4LenSqr ) ;
FourVectors out ;
out . x = AndSIMD ( x * fl4LengthInv , isBigEnough ) ;
out . y = AndSIMD ( y * fl4LengthInv , isBigEnough ) ;
out . z = MaskedAssign ( isBigEnough , z * fl4LengthInv , Four_Ones ) ;
return out ;
}
} ;
inline FourVectors CrossProduct ( const FourVectors & a , const FourVectors & b )
{
return FourVectors ( a . y * b . z - a . z * b . y , a . z * b . x - a . x * b . z , a . x * b . y - a . y * b . x ) ;
}
inline fltx4 DotProduct ( const FourVectors & a , const FourVectors & b )
{
return a . x * b . x + a . y * b . y + a . z * b . z ;
}
inline FourVectors operator * ( fltx4 left , const FourVectors & right )
{
return right * left ;
}
//
inline FourVectors Mul ( const FourVectors & a , const fltx4 & b )
{
FourVectors ret ;
ret . x = MulSIMD ( a . x , b ) ;
ret . y = MulSIMD ( a . y , b ) ;
ret . z = MulSIMD ( a . z , b ) ;
return ret ;
}
inline FourVectors Mul ( const FourVectors & a , const FourVectors & b )
{
FourVectors ret ;
ret . x = MulSIMD ( a . x , b . x ) ;
ret . y = MulSIMD ( a . y , b . y ) ;
ret . z = MulSIMD ( a . z , b . z ) ;
return ret ;
}
inline FourVectors Madd ( const FourVectors & a , const fltx4 & b , const FourVectors & c ) // a*b + c
{
FourVectors ret ;
ret . x = MaddSIMD ( a . x , b , c . x ) ;
ret . y = MaddSIMD ( a . y , b , c . y ) ;
ret . z = MaddSIMD ( a . z , b , c . z ) ;
return ret ;
}
/// form 4 cross products
inline FourVectors operator ^ ( const FourVectors & a , const FourVectors & b )
{
FourVectors ret ;
ret . x = SubSIMD ( MulSIMD ( a . y , b . z ) , MulSIMD ( a . z , b . y ) ) ;
ret . y = SubSIMD ( MulSIMD ( a . z , b . x ) , MulSIMD ( a . x , b . z ) ) ;
ret . z = SubSIMD ( MulSIMD ( a . x , b . y ) , MulSIMD ( a . y , b . x ) ) ;
return ret ;
}
inline FourVectors operator - ( const FourVectors & a , const FourVectors & b )
{
FourVectors ret ;
ret . x = SubSIMD ( a . x , b . x ) ;
ret . y = SubSIMD ( a . y , b . y ) ;
ret . z = SubSIMD ( a . z , b . z ) ;
return ret ;
}
inline FourVectors operator + ( const FourVectors & a , const FourVectors & b )
{
FourVectors ret ;
ret . x = AddSIMD ( a . x , b . x ) ;
ret . y = AddSIMD ( a . y , b . y ) ;
ret . z = AddSIMD ( a . z , b . z ) ;
return ret ;
}
/// component-by-componentwise MAX operator
inline FourVectors maximum ( const FourVectors & a , const FourVectors & b )
{
FourVectors ret ;
ret . x = MaxSIMD ( a . x , b . x ) ;
ret . y = MaxSIMD ( a . y , b . y ) ;
ret . z = MaxSIMD ( a . z , b . z ) ;
return ret ;
}
/// component-by-componentwise MIN operator
inline FourVectors minimum ( const FourVectors & a , const FourVectors & b )
{
FourVectors ret ;
ret . x = MinSIMD ( a . x , b . x ) ;
ret . y = MinSIMD ( a . y , b . y ) ;
ret . z = MinSIMD ( a . z , b . z ) ;
return ret ;
}
2022-07-08 00:55:01 +02:00
FORCEINLINE FourVectors RotateLeft ( const FourVectors & src )
{
2023-04-01 22:37:12 +02:00
FourVectors ret ;
ret . x = RotateLeft ( src . x ) ;
ret . y = RotateLeft ( src . y ) ;
ret . z = RotateLeft ( src . z ) ;
return ret ;
}
FORCEINLINE FourVectors RotateRight ( const FourVectors & src )
{
FourVectors ret ;
ret . x = RotateRight ( src . x ) ;
ret . y = RotateRight ( src . y ) ;
ret . z = RotateRight ( src . z ) ;
return ret ;
}
FORCEINLINE FourVectors MaskedAssign ( const bi32x4 & ReplacementMask , const FourVectors & NewValue , const FourVectors & OldValue )
{
FourVectors ret ;
ret . x = MaskedAssign ( ReplacementMask , NewValue . x , OldValue . x ) ;
ret . y = MaskedAssign ( ReplacementMask , NewValue . y , OldValue . y ) ;
ret . z = MaskedAssign ( ReplacementMask , NewValue . z , OldValue . z ) ;
return ret ;
}
/// calculate reflection vector. incident and normal dir assumed normalized
FORCEINLINE FourVectors VectorReflect ( const FourVectors & incident , const FourVectors & normal )
{
FourVectors ret = incident ;
fltx4 iDotNx2 = incident * normal ;
iDotNx2 = AddSIMD ( iDotNx2 , iDotNx2 ) ;
FourVectors nPart = normal ;
nPart *= iDotNx2 ;
ret -= nPart ; // i-2(n*i)n
return ret ;
}
/// calculate slide vector. removes the component of a vector which is parallel to a normal vector, leaving only the part perpendicular to the normal.
FORCEINLINE FourVectors VectorSlide ( const FourVectors & incident , const FourVectors & normal )
{
FourVectors ret = incident ;
fltx4 iDotN = incident * normal ;
FourVectors nPart = normal ;
nPart *= iDotN ;
ret -= nPart ; // i-(n*i)n
return ret ;
}
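// A minimal usage sketch (not part of the original header): reflect four ray
// directions off one shared, pre-normalized surface normal and damp the result.
// The function name and the restitution parameter are illustrative only.
inline FourVectors ExampleBounceDirections( const FourVectors &dir, const FourVectors &surfaceNormal, const fltx4 &fl4Restitution )
{
	FourVectors reflected = VectorReflect( dir, surfaceNormal );	// i - 2(n.i)n
	return Mul( reflected, fl4Restitution );						// scale all four lanes
}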
/// normalize all 4 vectors in place. not mega-accurate (uses reciprocal approximation instruction)
FORCEINLINE FourVectors VectorNormalizeFast ( const FourVectors & src )
{
fltx4 mag_sq = ReciprocalSqrtEstSIMD ( src * src ) ; // *(1.0/sqrt(length^2))
FourVectors result ;
result . x = MulSIMD ( src . x , mag_sq ) ;
result . y = MulSIMD ( src . y , mag_sq ) ;
result . z = MulSIMD ( src . z , mag_sq ) ;
return result ;
}
# if !defined(__SPU__)
/// Store a FourVectors into four NON-CONTIGUOUS Vector3D*'s.
FORCEINLINE void FourVectors::StoreUnalignedVector3SIMD ( Vector3D * RESTRICT out0 , Vector3D * RESTRICT out1 , Vector3D * RESTRICT out2 , Vector3D * RESTRICT out3 ) const
{
# ifdef _X360
fltx4 x0 , x1 , x2 , x3 , y0 , y1 , y2 , y3 , z0 , z1 , z2 , z3 ;
x0 = SplatXSIMD ( x ) ; // all x0x0x0x0
x1 = SplatYSIMD ( x ) ;
x2 = SplatZSIMD ( x ) ;
x3 = SplatWSIMD ( x ) ;
y0 = SplatXSIMD ( y ) ;
y1 = SplatYSIMD ( y ) ;
y2 = SplatZSIMD ( y ) ;
y3 = SplatWSIMD ( y ) ;
z0 = SplatXSIMD ( z ) ;
z1 = SplatYSIMD ( z ) ;
z2 = SplatZSIMD ( z ) ;
z3 = SplatWSIMD ( z ) ;
__stvewx ( x0 , out0 -> Base ( ) , 0 ) ; // store X word
__stvewx ( y0 , out0 -> Base ( ) , 4 ) ; // store Y word
__stvewx ( z0 , out0 -> Base ( ) , 8 ) ; // store Z word
__stvewx ( x1 , out1 -> Base ( ) , 0 ) ; // store X word
__stvewx ( y1 , out1 -> Base ( ) , 4 ) ; // store Y word
__stvewx ( z1 , out1 -> Base ( ) , 8 ) ; // store Z word
__stvewx ( x2 , out2 -> Base ( ) , 0 ) ; // store X word
__stvewx ( y2 , out2 -> Base ( ) , 4 ) ; // store Y word
__stvewx ( z2 , out2 -> Base ( ) , 8 ) ; // store Z word
__stvewx ( x3 , out3 -> Base ( ) , 0 ) ; // store X word
__stvewx ( y3 , out3 -> Base ( ) , 4 ) ; // store Y word
__stvewx ( z3 , out3 -> Base ( ) , 8 ) ; // store Z word
# else
fltx4 a , b , c , d ;
TransposeOnto ( a , b , c , d ) ;
StoreUnaligned3SIMD ( out0 -> Base ( ) , a ) ;
StoreUnaligned3SIMD ( out1 -> Base ( ) , b ) ;
StoreUnaligned3SIMD ( out2 -> Base ( ) , c ) ;
StoreUnaligned3SIMD ( out3 -> Base ( ) , d ) ;
# endif
}
/// Store a FourVectors into four NON-CONTIGUOUS VectorAligned structures.
FORCEINLINE void FourVectors::StoreAlignedVectorSIMD ( VectorAligned * RESTRICT out0 , VectorAligned * RESTRICT out1 , VectorAligned * RESTRICT out2 , VectorAligned * RESTRICT out3 ) const
{
fltx4 a , b , c , d ;
TransposeOnto ( a , b , c , d ) ;
StoreAligned3SIMD ( out0 , a ) ;
StoreAligned3SIMD ( out1 , b ) ;
StoreAligned3SIMD ( out2 , c ) ;
StoreAligned3SIMD ( out3 , d ) ;
}
# endif
# if !defined(__SPU__)
// Assume the given matrix is a rotation, and rotate these vectors by it.
// If you have a long list of FourVectors structures, all of which you want
// to rotate by the same matrix, use FourVectors::RotateManyBy() instead.
void FourVectors::RotateBy ( const matrix3x4_t & matrix )
{
// Splat out each of the entries in the matrix to a fltx4. Do this
// in the order that we will need them, to hide latency. I'm
// avoiding making an array of them, so that they'll remain in
// registers.
fltx4 matSplat00 , matSplat01 , matSplat02 ,
matSplat10 , matSplat11 , matSplat12 ,
matSplat20 , matSplat21 , matSplat22 ;
// Load the matrix into local vectors. Sadly, matrix3x4_ts are
// often unaligned. The w component of each loaded row will be that
// row's translation entry, but we don't really care about that here.
fltx4 matCol0 = LoadUnalignedSIMD ( matrix [ 0 ] ) ;
fltx4 matCol1 = LoadUnalignedSIMD ( matrix [ 1 ] ) ;
fltx4 matCol2 = LoadUnalignedSIMD ( matrix [ 2 ] ) ;
matSplat00 = SplatXSIMD ( matCol0 ) ;
matSplat01 = SplatYSIMD ( matCol0 ) ;
matSplat02 = SplatZSIMD ( matCol0 ) ;
matSplat10 = SplatXSIMD ( matCol1 ) ;
matSplat11 = SplatYSIMD ( matCol1 ) ;
matSplat12 = SplatZSIMD ( matCol1 ) ;
matSplat20 = SplatXSIMD ( matCol2 ) ;
matSplat21 = SplatYSIMD ( matCol2 ) ;
matSplat22 = SplatZSIMD ( matCol2 ) ;
// Trust in the compiler to schedule these operations correctly:
fltx4 outX , outY , outZ ;
outX = AddSIMD ( AddSIMD ( MulSIMD ( x , matSplat00 ) , MulSIMD ( y , matSplat01 ) ) , MulSIMD ( z , matSplat02 ) ) ;
outY = AddSIMD ( AddSIMD ( MulSIMD ( x , matSplat10 ) , MulSIMD ( y , matSplat11 ) ) , MulSIMD ( z , matSplat12 ) ) ;
outZ = AddSIMD ( AddSIMD ( MulSIMD ( x , matSplat20 ) , MulSIMD ( y , matSplat21 ) ) , MulSIMD ( z , matSplat22 ) ) ;
x = outX ;
y = outY ;
z = outZ ;
}
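// A minimal usage sketch (not part of the original header): FourVectors is a
// plain struct of three fltx4s, so it can be copied and rotated in place.
// Rotating four directions by one shared matrix costs a single RotateBy() call.
// The function name is illustrative only.
inline FourVectors ExampleRotateDirections( const FourVectors &dirs, const matrix3x4_t &rot )
{
	FourVectors v = dirs;	// copy so the caller's data is untouched
	v.RotateBy( rot );		// rotate all four directions by the same matrix
	return v;
}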
// Assume the given matrix is a rigid transform (a rotation plus a translation),
// and transform these vectors by it. If you have a long list of FourVectors
// structures to process with the same matrix, see FourVectors::RotateManyBy()
// for the batched (rotation-only) equivalent.
void FourVectors::TransformBy ( const matrix3x4_t & matrix )
{
// Splat out each of the entries in the matrix to a fltx4. Do this
// in the order that we will need them, to hide latency. I'm
// avoiding making an array of them, so that they'll remain in
// registers.
fltx4 matSplat00 , matSplat01 , matSplat02 ,
matSplat10 , matSplat11 , matSplat12 ,
matSplat20 , matSplat21 , matSplat22 ;
// Load the matrix into local vectors. Sadly, matrix3x4_ts are
// often unaligned. The w component of each loaded row will be that
// row's translation entry, which we only use at the very end.
fltx4 matCol0 = LoadUnalignedSIMD ( matrix [ 0 ] ) ;
fltx4 matCol1 = LoadUnalignedSIMD ( matrix [ 1 ] ) ;
fltx4 matCol2 = LoadUnalignedSIMD ( matrix [ 2 ] ) ;
matSplat00 = SplatXSIMD ( matCol0 ) ;
matSplat01 = SplatYSIMD ( matCol0 ) ;
matSplat02 = SplatZSIMD ( matCol0 ) ;
matSplat10 = SplatXSIMD ( matCol1 ) ;
matSplat11 = SplatYSIMD ( matCol1 ) ;
matSplat12 = SplatZSIMD ( matCol1 ) ;
matSplat20 = SplatXSIMD ( matCol2 ) ;
matSplat21 = SplatYSIMD ( matCol2 ) ;
matSplat22 = SplatZSIMD ( matCol2 ) ;
// Trust in the compiler to schedule these operations correctly:
fltx4 outX , outY , outZ ;
outX = MaddSIMD ( z , matSplat02 , AddSIMD ( MulSIMD ( x , matSplat00 ) , MulSIMD ( y , matSplat01 ) ) ) ;
outY = MaddSIMD ( z , matSplat12 , AddSIMD ( MulSIMD ( x , matSplat10 ) , MulSIMD ( y , matSplat11 ) ) ) ;
outZ = MaddSIMD ( z , matSplat22 , AddSIMD ( MulSIMD ( x , matSplat20 ) , MulSIMD ( y , matSplat21 ) ) ) ;
x = AddSIMD ( outX , ReplicateX4 ( matrix [ 0 ] [ 3 ] ) ) ;
y = AddSIMD ( outY , ReplicateX4 ( matrix [ 1 ] [ 3 ] ) ) ;
z = AddSIMD ( outZ , ReplicateX4 ( matrix [ 2 ] [ 3 ] ) ) ;
}
# endif
fltx4 NoiseSIMD ( FourVectors const & v ) ;
// vector valued noise direction
FourVectors DNoiseSIMD ( FourVectors const & v ) ;
// vector value "curl" noise function. see http://hyperphysics.phy-astr.gsu.edu/hbase/curl.html
FourVectors CurlNoiseSIMD ( FourVectors const & v ) ;
//#endif // !defined SPU
/// quick, low quality perlin-style noise() function suitable for real time use.
/// return value is -1..1. Only reliable around +/- 1 million or so.
fltx4 NoiseSIMD ( const fltx4 & x , const fltx4 & y , const fltx4 & z ) ;
/// calculate the absolute value of a packed single
inline fltx4 fabs ( const fltx4 & x )
{
return AndSIMD ( x , LoadAlignedSIMD ( g_SIMD_clear_signmask ) ) ;
}
// Convenience version
inline fltx4 AbsSIMD ( const fltx4 & x )
{
return fabs ( x ) ;
}
/// negate all four components of a SIMD packed single
inline fltx4 fnegate ( const fltx4 & x )
{
return XorSIMD ( x , LoadAlignedSIMD ( g_SIMD_signmask ) ) ;
}
fltx4 Pow_FixedPoint_Exponent_SIMD ( const fltx4 & x , int exponent ) ;
// PowSIMD - raise a SIMD register to a power. This is analogous to the C pow() function, with some
// restrictions: fractional exponents are only handled with 2 bits of precision. Basically,
// fractions of 0, .25, .5, and .75 are handled. PowSIMD(x,.30) gives the same result as PowSIMD(x,.25).
// Negative and fractional powers are handled by the SIMD reciprocal and square root approximation
// instructions and so are not especially accurate. Note that this routine does not raise
// numeric exceptions, because it uses SIMD. This routine is O(log2(exponent)).
inline fltx4 PowSIMD ( const fltx4 & x , float exponent )
{
return Pow_FixedPoint_Exponent_SIMD ( x , ( int ) ( 4.0 * exponent ) ) ;
}
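// Illustrative sketch (not part of the original header): because the exponent is
// quantized to quarters, nearby fractional exponents collapse to the same call.
// The function name is an example only.
inline fltx4 ExamplePowSIMDQuantization( const fltx4 &v )
{
	// (int)( 4.0 * 2.30 ) and (int)( 4.0 * 2.25 ) are both 9, so these two calls
	// produce identical results:
	fltx4 a = PowSIMD( v, 2.30f );
	fltx4 b = PowSIMD( v, 2.25f );
	return SubSIMD( a, b );	// all zeros
}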
/// (x<1)?x^(1/2.2):1. Use a 4th order polynomial to approximate x^(1/2.2) over 0..1
inline fltx4 LinearToGammaSIMD ( fltx4 x )
{
// y = -3.7295x4 + 8.9635x3 - 7.7397x2 + 3.443x + 0.048
x = MaxSIMD ( MinSIMD ( Four_Ones , x ) , Four_Zeros ) ;
return AddSIMD ( Four_LinearToGammaCoefficients_E ,
MulSIMD ( x , AddSIMD ( Four_LinearToGammaCoefficients_D ,
MulSIMD ( x , AddSIMD ( Four_LinearToGammaCoefficients_C ,
MulSIMD ( x , AddSIMD ( Four_LinearToGammaCoefficients_B ,
MulSIMD ( x , Four_LinearToGammaCoefficients_A ) ) ) ) ) ) ) ) ;
}
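/// ( x < 1 ) ? x^2.2 : 1. Companion to LinearToGammaSIMD: clamps the input to >= 0,
/// approximates x^2.2 over 0..1 with a 3rd order polynomial, and clamps the result to <= 1.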
inline fltx4 GammaToLinearSIMD ( fltx4 x )
{
x = MaxSIMD ( x , Four_Zeros ) ;
x = AddSIMD ( Four_GammaToLinearCoefficients_D ,
MulSIMD ( x , AddSIMD ( Four_GammaToLinearCoefficients_C ,
MulSIMD ( x , AddSIMD ( Four_GammaToLinearCoefficients_B ,
MulSIMD ( x , Four_GammaToLinearCoefficients_A ) ) ) ) ) ) ;
return MinSIMD ( x , Four_Ones ) ;
}
/// ( x > 1 ) ? x : x^2.2
inline fltx4 GammaToLinearExtendedSIMD ( fltx4 x )
{
x = MaxSIMD ( x , Four_Zeros ) ;
fltx4 fl4Ret = AddSIMD ( Four_GammaToLinearCoefficients_D ,
MulSIMD ( x , AddSIMD ( Four_GammaToLinearCoefficients_C ,
MulSIMD ( x , AddSIMD ( Four_GammaToLinearCoefficients_B ,
MulSIMD ( x , Four_GammaToLinearCoefficients_A ) ) ) ) ) ) ;
return MaskedAssign ( CmpGeSIMD ( x , Four_Ones ) , x , fl4Ret ) ;
}
// random number generation - generate 4 random numbers quickly.
void SeedRandSIMD ( uint32 seed ) ; // seed the random # generator
fltx4 RandSIMD ( int nContext = 0 ) ; // return 4 numbers in the 0..1 range
// for multithreaded, you need to use these and use the argument form of RandSIMD:
int GetSIMDRandContext ( void ) ;
void ReleaseSIMDRandContext ( int nContext ) ;
FORCEINLINE fltx4 RandSignedSIMD ( void ) // -1..1
{
return SubSIMD ( MulSIMD ( Four_Twos , RandSIMD ( ) ) , Four_Ones ) ;
}
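// A minimal multithreaded usage sketch (not part of the original header): each
// worker grabs its own context so RandSIMD() calls don't contend. In real code
// you would keep the context for the thread's lifetime rather than per call.
// The function name is illustrative only.
inline fltx4 ExampleRandSignedThreaded( void )
{
	int nCtx = GetSIMDRandContext();
	fltx4 fl4Rand = RandSIMD( nCtx );										// four values in 0..1
	fltx4 fl4Result = SubSIMD( MulSIMD( Four_Twos, fl4Rand ), Four_Ones );	// remap to -1..1
	ReleaseSIMDRandContext( nCtx );
	return fl4Result;
}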
FORCEINLINE fltx4 LerpSIMD ( const fltx4 & percent , const fltx4 & a , const fltx4 & b )
{
return AddSIMD ( a , MulSIMD ( SubSIMD ( b , a ) , percent ) ) ;
}
FORCEINLINE fltx4 RemapValClampedSIMD ( const fltx4 & val , const fltx4 & a , const fltx4 & b , const fltx4 & c , const fltx4 & d ) // Remap val from clamped range between a and b to new range between c and d
{
fltx4 range = MaskedAssign ( CmpEqSIMD ( a , b ) , Four_Ones , SubSIMD ( b , a ) ) ; //make sure range > 0
fltx4 cVal = MaxSIMD ( Four_Zeros , MinSIMD ( Four_Ones , DivSIMD ( SubSIMD ( val , a ) , range ) ) ) ; //saturate
return LerpSIMD ( cVal , c , d ) ;
}
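// Illustrative sketch (not part of the original header): remap four distances
// from 0..100 units into fade factors that run 1..0. All names are examples.
inline fltx4 ExampleFadeFromDistance( const fltx4 &fl4Dist )
{
	return RemapValClampedSIMD( fl4Dist,
		Four_Zeros, ReplicateX4( 100.0f ),	// source range 0..100
		Four_Ones, Four_Zeros );			// dest range 1..0
}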
// SIMD versions of mathlib simplespline functions
// hermite basis function for smooth interpolation
// Similar to Gain() above, but very cheap to call
// value should be between 0 & 1 inclusive
inline fltx4 SimpleSpline ( const fltx4 & value )
{
// Arranged to avoid a data dependency between these two MULs:
fltx4 valueDoubled = MulSIMD ( value , Four_Twos ) ;
fltx4 valueSquared = MulSIMD ( value , value ) ;
// Nice little ease-in, ease-out spline-like curve
return SubSIMD (
MulSIMD ( Four_Threes , valueSquared ) ,
MulSIMD ( valueDoubled , valueSquared ) ) ;
}
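// Illustrative sketch (not part of the original header): SimpleSpline pairs
// naturally with LerpSIMD for a cheap ease-in/ease-out blend. Names are examples.
inline fltx4 ExampleEaseInOut( const fltx4 &fl4T, const fltx4 &fl4From, const fltx4 &fl4To )
{
	fltx4 fl4Smooth = SimpleSpline( fl4T );			// 3t^2 - 2t^3, for t in 0..1
	return LerpSIMD( fl4Smooth, fl4From, fl4To );
}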
// remaps a value in [startInterval, startInterval+rangeInterval] from linear to
// spline using SimpleSpline
inline fltx4 SimpleSplineRemapValWithDeltas ( const fltx4 & val ,
const fltx4 & A , /*const fltx4& BMinusA,*/
const fltx4 & OneOverBMinusA , const fltx4 & C ,
const fltx4 & DMinusC )
{
// if ( A == B )
// return val >= B ? D : C;
fltx4 cVal = MulSIMD ( SubSIMD ( val , A ) , OneOverBMinusA ) ;
return AddSIMD ( C , MulSIMD ( DMinusC , SimpleSpline ( cVal ) ) ) ;
}
inline fltx4 SimpleSplineRemapValWithDeltasClamped ( const fltx4 & val ,
const fltx4 & A , /*const fltx4& BMinusA,*/
const fltx4 & OneOverBMinusA , const fltx4 & C ,
const fltx4 & DMinusC )
{
// if ( A == B )
// return val >= B ? D : C;
fltx4 cVal = MulSIMD ( SubSIMD ( val , A ) , OneOverBMinusA ) ;
cVal = MinSIMD ( Four_Ones , MaxSIMD ( Four_Zeros , cVal ) ) ;
return AddSIMD ( C , MulSIMD ( DMinusC , SimpleSpline ( cVal ) ) ) ;
}
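// FracSIMD returns the fractional part of each component, preserving the sign of
// the input. It relies on the usual float trick: adding and then subtracting 2^23
// rounds a non-negative single to an integer because the mantissa runs out of
// fraction bits, and the CmpGt/MaskedAssign step corrects the rounded-up case back to floor.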
FORCEINLINE fltx4 FracSIMD ( const fltx4 & val )
{
fltx4 fl4Abs = fabs ( val ) ;
fltx4 ival = SubSIMD ( AddSIMD ( fl4Abs , Four_2ToThe23s ) , Four_2ToThe23s ) ;
ival = MaskedAssign ( CmpGtSIMD ( ival , fl4Abs ) , SubSIMD ( ival , Four_Ones ) , ival ) ;
return XorSIMD ( SubSIMD ( fl4Abs , ival ) , XorSIMD ( val , fl4Abs ) ) ; // restore sign bits
}
# ifndef SPU
// Disable on SPU for the moment as it generates a warning
// warning: dereferencing type-punned pointer will break strict-aliasing rules
// This is related to LoadAlignedSIMD( (float *) g_SIMD_lsbmask )
// LoadAlignedSIMD() under the hood is dereferencing the variable.
FORCEINLINE fltx4 Mod2SIMD ( const fltx4 & val )
{
fltx4 fl4Abs = fabs ( val ) ;
fltx4 ival = SubSIMD ( AndSIMD ( LoadAlignedSIMD ( ( float * ) g_SIMD_lsbmask ) , AddSIMD ( fl4Abs , Four_2ToThe23s ) ) , Four_2ToThe23s ) ;
ival = MaskedAssign ( CmpGtSIMD ( ival , fl4Abs ) , SubSIMD ( ival , Four_Twos ) , ival ) ;
return XorSIMD ( SubSIMD ( fl4Abs , ival ) , XorSIMD ( val , fl4Abs ) ) ; // restore sign bits
}
# endif
FORCEINLINE fltx4 Mod2SIMDPositiveInput ( const fltx4 & val )
{
fltx4 ival = SubSIMD ( AndSIMD ( LoadAlignedSIMD ( g_SIMD_lsbmask ) , AddSIMD ( val , Four_2ToThe23s ) ) , Four_2ToThe23s ) ;
ival = MaskedAssign ( CmpGtSIMD ( ival , val ) , SubSIMD ( ival , Four_Twos ) , ival ) ;
return SubSIMD ( val , ival ) ;
}
// approximate sin of an angle, with -1..1 representing the whole sin wave period instead of -pi..pi.
// no range reduction is done - for values outside of 0..1 you won't like the results
FORCEINLINE fltx4 _SinEst01SIMD ( const fltx4 & val )
{
// really rough approximation - x*(4-x*4) - a parabola. s(0) = 0, s(.5) = 1, s(1)=0, smooth in-between.
// sufficient for simple oscillation.
return MulSIMD ( val , SubSIMD ( Four_Fours , MulSIMD ( val , Four_Fours ) ) ) ;
}
FORCEINLINE fltx4 _Sin01SIMD ( const fltx4 & val )
{
// not a bad approximation : the parabola always over-estimates, and the squared parabola always
// underestimates, so let's blend between them: goodsin = badsin + .225*( badsin^2 - badsin )
fltx4 fl4BadEst = MulSIMD ( val , SubSIMD ( Four_Fours , MulSIMD ( val , Four_Fours ) ) ) ;
return AddSIMD ( MulSIMD ( Four_Point225s , SubSIMD ( MulSIMD ( fl4BadEst , fl4BadEst ) , fl4BadEst ) ) , fl4BadEst ) ;
}
// full range usable implementations
FORCEINLINE fltx4 SinEst01SIMD ( const fltx4 & val )
{
fltx4 fl4Abs = fabs ( val ) ;
fltx4 fl4Reduced2 = Mod2SIMDPositiveInput ( fl4Abs ) ;
bi32x4 fl4OddMask = CmpGeSIMD ( fl4Reduced2 , Four_Ones ) ;
fltx4 fl4val = SubSIMD ( fl4Reduced2 , AndSIMD ( Four_Ones , fl4OddMask ) ) ;
fltx4 fl4Sin = _SinEst01SIMD ( fl4val ) ;
fl4Sin = XorSIMD ( fl4Sin , AndSIMD ( LoadAlignedSIMD ( g_SIMD_signmask ) , XorSIMD ( val , fl4OddMask ) ) ) ;
return fl4Sin ;
}
FORCEINLINE fltx4 Sin01SIMD ( const fltx4 & val )
{
fltx4 fl4Abs = fabs ( val ) ;
fltx4 fl4Reduced2 = Mod2SIMDPositiveInput ( fl4Abs ) ;
bi32x4 fl4OddMask = CmpGeSIMD ( fl4Reduced2 , Four_Ones ) ;
fltx4 fl4val = SubSIMD ( fl4Reduced2 , AndSIMD ( Four_Ones , fl4OddMask ) ) ;
fltx4 fl4Sin = _Sin01SIMD ( fl4val ) ;
fl4Sin = XorSIMD ( fl4Sin , AndSIMD ( LoadAlignedSIMD ( g_SIMD_signmask ) , XorSIMD ( val , fl4OddMask ) ) ) ;
return fl4Sin ;
}
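// A minimal usage sketch (not part of the original header): these helpers treat
// an input span of 2 as one full sine period (-1..1 plays the role of -pi..pi),
// and the Mod2 reduction above makes them safe for unbounded phase. Names are examples.
inline fltx4 ExampleOscillate( const fltx4 &fl4TimeSeconds, const fltx4 &fl4FrequencyHz )
{
	fltx4 fl4Phase = MulSIMD( Four_Twos, MulSIMD( fl4TimeSeconds, fl4FrequencyHz ) );
	return Sin01SIMD( fl4Phase );	// four sines per call, roughly -1..1
}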
FORCEINLINE fltx4 NatExpSIMD ( const fltx4 & val ) // why is ExpSimd( x ) defined to be 2^x?
{
// need to write this. just stub with normal float implementation for now
fltx4 fl4Result ;
SubFloat ( fl4Result , 0 ) = exp ( SubFloat ( val , 0 ) ) ;
SubFloat ( fl4Result , 1 ) = exp ( SubFloat ( val , 1 ) ) ;
SubFloat ( fl4Result , 2 ) = exp ( SubFloat ( val , 2 ) ) ;
SubFloat ( fl4Result , 3 ) = exp ( SubFloat ( val , 3 ) ) ;
return fl4Result ;
}
// Schlick-style Bias approximation (see Graphics Gems 4): bias(t,a) = t / ( (1/a - 2)*(1 - t) + 1 )
FORCEINLINE fltx4 PreCalcBiasParameter ( const fltx4 & bias_parameter )
{
// convert perlin-style-bias parameter to the value right for the approximation
return SubSIMD ( ReciprocalSIMD ( bias_parameter ) , Four_Twos ) ;
}
FORCEINLINE fltx4 BiasSIMD ( const fltx4 & val , const fltx4 & precalc_param )
{
// similar to bias function except pass precalced bias value from calling PreCalcBiasParameter.
//!!speed!! use reciprocal est?
//!!speed!! could save one op by precalcing _2_ values
return DivSIMD ( val , AddSIMD ( MulSIMD ( precalc_param , SubSIMD ( Four_Ones , val ) ) , Four_Ones ) ) ;
}
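// A minimal usage sketch (not part of the original header): precompute the bias
// parameter once (ideally outside any per-element loop), then apply it per batch.
// The 0.7 bias value and the function name are illustrative only.
inline fltx4 ExampleBias07( const fltx4 &fl4Values )
{
	fltx4 fl4Precalc = PreCalcBiasParameter( ReplicateX4( 0.7f ) );
	return BiasSIMD( fl4Values, fl4Precalc );
}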
//-----------------------------------------------------------------------------
// Box/plane test
// NOTE: The w component of emins + emaxs must be 1 for this to work
//-----------------------------------------------------------------------------
# ifndef SPU
// We don't need this on SPU right now
FORCEINLINE int BoxOnPlaneSideSIMD ( const fltx4 & emins , const fltx4 & emaxs , const cplane_t * p , float tolerance = 0.f )
{
fltx4 corners [ 2 ] ;
fltx4 normal = LoadUnalignedSIMD ( p -> normal . Base ( ) ) ;
fltx4 dist = ReplicateX4 ( - p -> dist ) ;
normal = SetWSIMD ( normal , dist ) ;
fltx4 t4 = ReplicateX4 ( tolerance ) ;
fltx4 negt4 = ReplicateX4 ( - tolerance ) ;
bi32x4 cmp = CmpGeSIMD ( normal , Four_Zeros ) ;
corners [ 0 ] = MaskedAssign ( cmp , emaxs , emins ) ;
corners [ 1 ] = MaskedAssign ( cmp , emins , emaxs ) ;
fltx4 dot1 = Dot4SIMD ( normal , corners [ 0 ] ) ;
fltx4 dot2 = Dot4SIMD ( normal , corners [ 1 ] ) ;
cmp = CmpGeSIMD ( dot1 , t4 ) ;
bi32x4 cmp2 = CmpGtSIMD ( negt4 , dot2 ) ;
fltx4 result = MaskedAssign ( cmp , Four_Ones , Four_Zeros ) ;
fltx4 result2 = MaskedAssign ( cmp2 , Four_Twos , Four_Zeros ) ;
result = AddSIMD ( result , result2 ) ;
intx4 sides ;
ConvertStoreAsIntsSIMD ( & sides , result ) ;
return sides [ 0 ] ;
}
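// A minimal usage sketch (not part of the original header): classify an AABB
// against a plane. The helper name is illustrative; the key detail is forcing
// the w component of both corners to 1, as required above. The return value
// follows the usual convention: 1 = in front, 2 = behind, 3 = straddling.
inline int ExampleClassifyBox( const Vector3D &mins, const Vector3D &maxs, const cplane_t *pPlane )
{
	fltx4 emins = SetWSIMD( LoadUnaligned3SIMD( mins.Base() ), Four_Ones );
	fltx4 emaxs = SetWSIMD( LoadUnaligned3SIMD( maxs.Base() ), Four_Ones );
	return BoxOnPlaneSideSIMD( emins, emaxs, pPlane );
}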
// k-dop bounding volume. 26-dop bounds with 13 plane-pairs plus 3 other "arbitrary bounds". The arbitrary values could be used to hold type info, etc.,
// which can then be compared against "for free".
class KDop32_t
{
public :
fltx4 m_Mins [ 4 ] ;
fltx4 m_Maxes [ 4 ] ;
FORCEINLINE bool Intersects ( KDop32_t const & other ) const ;
FORCEINLINE void operator |= ( KDop32_t const & other ) ;
FORCEINLINE bool IsEmpty ( void ) const ;
FORCEINLINE void Init ( void )
{
for ( int i = 0 ; i < ARRAYSIZE ( m_Mins ) ; i++ )
{
m_Mins [ i ] = Four_FLT_MAX ;
m_Maxes [ i ] = Four_Negative_FLT_MAX ;
}
}
// given a set of points, expand the kdop to contain them
void AddPointSet ( Vector3D const * pPoints , int nPnts ) ;
void CreateFromPointSet ( Vector3D const * pPoints , int nPnts ) ;
} ;
FORCEINLINE void KDop32_t::operator |= ( KDop32_t const & other )
{
m_Mins [ 0 ] = MinSIMD ( m_Mins [ 0 ] , other . m_Mins [ 0 ] ) ;
m_Mins [ 1 ] = MinSIMD ( m_Mins [ 1 ] , other . m_Mins [ 1 ] ) ;
m_Mins [ 2 ] = MinSIMD ( m_Mins [ 2 ] , other . m_Mins [ 2 ] ) ;
m_Mins [ 3 ] = MinSIMD ( m_Mins [ 3 ] , other . m_Mins [ 3 ] ) ;
m_Maxes [ 0 ] = MaxSIMD ( m_Maxes [ 0 ] , other . m_Maxes [ 0 ] ) ;
m_Maxes [ 1 ] = MaxSIMD ( m_Maxes [ 1 ] , other . m_Maxes [ 1 ] ) ;
m_Maxes [ 2 ] = MaxSIMD ( m_Maxes [ 2 ] , other . m_Maxes [ 2 ] ) ;
m_Maxes [ 3 ] = MaxSIMD ( m_Maxes [ 3 ] , other . m_Maxes [ 3 ] ) ;
}
FORCEINLINE bool KDop32_t::Intersects ( KDop32_t const & other ) const
{
bi32x4 c00 = CmpLeSIMD ( m_Mins [ 0 ] , other . m_Maxes [ 0 ] ) ;
bi32x4 c01 = CmpLeSIMD ( m_Mins [ 1 ] , other . m_Maxes [ 1 ] ) ;
bi32x4 c02 = CmpLeSIMD ( m_Mins [ 2 ] , other . m_Maxes [ 2 ] ) ;
bi32x4 c03 = CmpLeSIMD ( m_Mins [ 3 ] , other . m_Maxes [ 3 ] ) ;
bi32x4 c10 = CmpGeSIMD ( m_Maxes [ 0 ] , other . m_Mins [ 0 ] ) ;
bi32x4 c11 = CmpGeSIMD ( m_Maxes [ 1 ] , other . m_Mins [ 1 ] ) ;
bi32x4 c12 = CmpGeSIMD ( m_Maxes [ 2 ] , other . m_Mins [ 2 ] ) ;
bi32x4 c13 = CmpGeSIMD ( m_Maxes [ 3 ] , other . m_Mins [ 3 ] ) ;
bi32x4 a0 = AndSIMD ( AndSIMD ( c00 , c01 ) , AndSIMD ( c02 , c03 ) ) ;
bi32x4 a1 = AndSIMD ( AndSIMD ( c10 , c11 ) , AndSIMD ( c12 , c13 ) ) ;
return ! ( IsAnyZeros ( AndSIMD ( a1 , a0 ) ) ) ;
}
FORCEINLINE bool KDop32_t::IsEmpty ( void ) const
{
bi32x4 c00 = CmpLtSIMD ( m_Maxes [ 0 ] , m_Mins [ 0 ] ) ;
bi32x4 c01 = CmpLtSIMD ( m_Maxes [ 1 ] , m_Mins [ 1 ] ) ;
bi32x4 c02 = CmpLtSIMD ( m_Maxes [ 2 ] , m_Mins [ 2 ] ) ;
bi32x4 c03 = CmpLtSIMD ( m_Maxes [ 3 ] , m_Mins [ 3 ] ) ;
return IsAnyTrue ( OrSIMD ( OrSIMD ( c00 , c01 ) , OrSIMD ( c02 , c03 ) ) ) ;
}
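// A minimal usage sketch (not part of the original header): build two bounding
// volumes from point sets and test them for overlap. Names are illustrative only.
inline bool ExampleKDopOverlap( Vector3D const *pPointsA, int nCountA, Vector3D const *pPointsB, int nCountB )
{
	KDop32_t a, b;
	a.CreateFromPointSet( pPointsA, nCountA );
	b.CreateFromPointSet( pPointsB, nCountB );
	return a.Intersects( b );
}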
extern const fltx4 g_KDop32XDirs [ 4 ] ;
extern const fltx4 g_KDop32YDirs [ 4 ] ;
extern const fltx4 g_KDop32ZDirs [ 4 ] ;
# endif
#if 0
// FIXME!!! If we need a version of this that runs on 360, this is a work-in-progress version that hasn't been debugged.
# define _VEC_SWIZZLE_QUAT48_UNPACK (__vector unsigned char) { 16, 17, 0, 1, 16, 17, 2, 3, 16, 17, 4, 5, 16, 17, 6, 7 }
# define _VEC_SWIZZLE_QUAT48_UNPACK_SHIFT (__vector unsigned int ) { 0, 0, 1, 0 }
// unpack a single Quaternion48 at the pointer into the x,y,z,w components of a fltx4
FORCEINLINE fltx4 UnpackQuaternion48SIMD ( const Quaternion48 * RESTRICT pVec )
{
// A quaternion 48 stores the x and y components as 0..65535 , which is almost mapped onto -1.0..1.0 via (x - 32768) / 32768.5 .
// z is stored as 0..32767, which is almost mapped onto -1..1 via (z - 16384) / 16384.5 .
// w is inferred from 1 - the dot product of the other three components. The top bit of what would otherwise be the 16-bit z is
// w's sign bit.
// fltx4 q16s = XMLoadVector3((const void *)pVec);
fltx4 q16s = LoadUnaligned3SIMD ( ( const float * ) pVec ) ;
// fltx4 shift = *( fltx4 * )&g_SIMD_Quat48_Unpack_Shift; // load the aligned shift mask that we use to shuffle z.
// fltx4 permute = *( fltx4 * )&g_SIMD_Quat48_Unpack_Permute0; // load the permute word that shuffles x,y,z into their own words
bool wneg = pVec -> wneg ; // loading pVec into two different kinds of registers -- but not shuffling between (I hope!) so no LHS.
// q16s = __vperm( q16s, Four_Threes, permute ); // permute so that x, y, and z are now each in their own words. The top half is the floating point rep of 3.0f
q16s = vec_perm ( q16s , Four_Threes , _VEC_SWIZZLE_QUAT48_UNPACK ) ; // permute so that x, y, and z are now each in their own words. The top half is the floating point rep of 3.0f
// q16s = __vslh(q16s, shift); // shift the z component left by one bit, tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16)
// q16s = vec_sl( *( u32x4 * )( void * )( &q16s ), _VEC_SWIZZLE_QUAT48_UNPACK_SHIFT ); // shift the z component left by one bit, tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16)
u32x4 tmp = IntShiftLeftWordSIMD ( * ( u32x4 * ) & q16s , _VEC_SWIZZLE_QUAT48_UNPACK_SHIFT ) ;
q16s = * ( fltx4 * ) & tmp ;
// each word of q16s contains 3.0 + n * 2^-22 -- convert this so that we get numbers on the range -1..1
const fltx4 vUpkMul = SplatXSIMD ( g_SIMD_Quat48_Unpack_Magic_Constants ) ; // { UnpackMul16s, UnpackMul16s, UnpackMul16s, UnpackMul16s };
const fltx4 vUpkAdd = SplatYSIMD ( g_SIMD_Quat48_Unpack_Magic_Constants ) ;
/*
fltx4 ret = __vcfux ( q16s , 0 ) ; // convert from uint16 to floats.
// scale from 0..65535 to -1..1 : tmp.x = ((int)x - 32768) * (1 / 32768.0);
ret = __vmaddfp ( ret , g_SIMD_Quat48_DivByU15 , Four_NegativeOnes ) ;
*/
// fltx4 ret = __vmaddfp( q16s, vUpkMul, vUpkAdd );
fltx4 ret = vec_madd ( q16s , vUpkMul , vUpkAdd ) ;
// now, work out what w must be.
fltx4 dotxyz = Dot3SIMD ( ret , ret ) ; // all components are dot product of ret w/ self.
dotxyz = ClampVectorSIMD ( dotxyz , Four_Zeros , Four_Ones ) ;
fltx4 ww = SubSIMD ( Four_Ones , dotxyz ) ; // all components are 1 - dotxyz
ww = SqrtSIMD ( ww ) ; // all components are sqrt(1-dotxyz)
if ( wneg )
{
ret = SetWSIMD ( ret , NegSIMD ( ww ) ) ;
// ret = __vrlimi( ret, NegSIMD(ww), 1, 0 ); // insert one element from the ww vector into the w component of ret
}
else
{
ret = SetWSIMD ( ret , ww ) ;
// ret = __vrlimi( ret, ww, 1, 0 ); // insert one element from the ww vector into the w component of ret
}
return ret ;
}
# endif
// These are not optimized right now on some platforms. We should be able to shuffle the values there instead.
// As the methods are hard-coded we can actually avoid loading memory to do the transfer.
// We should be able to create all the versions we need.
FORCEINLINE fltx4 SetWFromXSIMD ( const fltx4 & a , const fltx4 & x )
{
fltx4 value = SplatXSIMD ( x ) ;
return SetWSIMD ( a , value ) ;
}
FORCEINLINE fltx4 SetWFromYSIMD ( const fltx4 & a , const fltx4 & y )
{
fltx4 value = SplatYSIMD ( y ) ;
return SetWSIMD ( a , value ) ;
}
FORCEINLINE fltx4 SetWFromZSIMD ( const fltx4 & a , const fltx4 & z )
{
fltx4 value = SplatZSIMD ( z ) ;
return SetWSIMD ( a , value ) ;
}
FORCEINLINE fltx4 CrossProductSIMD ( const fltx4 & A , const fltx4 & B )
{
# if defined( _X360 )
return XMVector3Cross ( A , B ) ;
# elif defined( _WIN32 )
fltx4 A1 = _mm_shuffle_ps ( A , A , MM_SHUFFLE_REV ( 1 , 2 , 0 , 3 ) ) ;
fltx4 B1 = _mm_shuffle_ps ( B , B , MM_SHUFFLE_REV ( 2 , 0 , 1 , 3 ) ) ;
fltx4 Result1 = MulSIMD ( A1 , B1 ) ;
fltx4 A2 = _mm_shuffle_ps ( A , A , MM_SHUFFLE_REV ( 2 , 0 , 1 , 3 ) ) ;
fltx4 B2 = _mm_shuffle_ps ( B , B , MM_SHUFFLE_REV ( 1 , 2 , 0 , 3 ) ) ;
fltx4 Result2 = MulSIMD ( A2 , B2 ) ;
return SubSIMD ( Result1 , Result2 ) ;
# elif defined(_PS3)
/*
fltx4 perm1 = ( vector unsigned char ) { 0x04 , 0x05 , 0x06 , 0x07 , 0x08 , 0x09 , 0x0a , 0x0b , 0x00 , 0x01 , 0x02 , 0x03 , 0x0c , 0x0d , 0x0e , 0x0f } ;
fltx4 perm2 = ( vector unsigned char ) { 0x08 , 0x09 , 0x0a , 0x0b , 0x00 , 0x01 , 0x02 , 0x03 , 0x04 , 0x05 , 0x06 , 0x07 , 0x0c , 0x0d , 0x0e , 0x0f } ;
fltx4 A1 = __vpermwi ( A , A , perm1 ) ;
fltx4 A2 = __vpermwi ( B , B , perm2 ) ;
fltx4 Result1 = MulSIMD ( A1 , B1 ) ;
fltx4 A2 = __vpermwi ( A , A , perm2 ) ;
fltx4 B2 = __vpermwi ( B , B , perm1 ) ;
return MsubSIMD ( A2 , B2 , Result1 ) ;
*/
return _vmathVfCross ( A , B ) ;
# else
fltx4 CrossVal ;
SubFloat ( CrossVal , 0 ) = SubFloat ( A , 1 ) * SubFloat ( B , 2 ) - SubFloat ( A , 2 ) * SubFloat ( B , 1 ) ;
SubFloat ( CrossVal , 1 ) = SubFloat ( A , 2 ) * SubFloat ( B , 0 ) - SubFloat ( A , 0 ) * SubFloat ( B , 2 ) ;
SubFloat ( CrossVal , 2 ) = SubFloat ( A , 0 ) * SubFloat ( B , 1 ) - SubFloat ( A , 1 ) * SubFloat ( B , 0 ) ;
SubFloat ( CrossVal , 3 ) = 0 ;
return CrossVal ;
# endif
}
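// Length3SIMD / Normalized3SIMD below guard against degenerate inputs: when the
// squared length is not greater than Four_Epsilons, the comparison mask is all
// zeros and the final AndSIMD forces the result to zero rather than letting the
// Inf/NaN from ReciprocalSqrtSIMD( 0 ) escape.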
inline const fltx4 Length3SIMD ( const fltx4 vec )
{
fltx4 scLengthSqr = Dot3SIMD ( vec , vec ) ;
bi32x4 isSignificant = CmpGtSIMD ( scLengthSqr , Four_Epsilons ) ;
fltx4 scLengthInv = ReciprocalSqrtSIMD ( scLengthSqr ) ;
return AndSIMD ( isSignificant , MulSIMD ( scLengthInv , scLengthSqr ) ) ;
}
inline const fltx4 Normalized3SIMD ( const fltx4 vec )
{
fltx4 scLengthSqr = Dot3SIMD ( vec , vec ) ;
bi32x4 isSignificant = CmpGtSIMD ( scLengthSqr , Four_Epsilons ) ;
fltx4 scLengthInv = ReciprocalSqrtSIMD ( scLengthSqr ) ;
return AndSIMD ( isSignificant , MulSIMD ( vec , scLengthInv ) ) ;
}
// Some convenience operator overloads, which are just aliasing the functions above.
// Unnecessary on 360, as you already have them from xboxmath.h
// Component-wise add, subtract, and multiply
# if !defined (COMPILER_GCC) && !defined (COMPILER_CLANG)
FORCEINLINE fltx4 operator += ( fltx4 & a , FLTX4 b )
{
a = AddSIMD ( a , b ) ;
return a ;
}
FORCEINLINE fltx4 operator -= ( fltx4 & a , FLTX4 b )
{
a = SubSIMD ( a , b ) ;
return a ;
}
FORCEINLINE fltx4 operator *= ( fltx4 & a , FLTX4 b )
{
a = MulSIMD ( a , b ) ;
return a ;
}
# endif
# endif // SSEMATH_H