mathlib_base.cpp #1

//========= Copyright  1996-2005, Valve Corporation, All rights reserved. ============//
//
// Purpose: Math primitives.
//
// $Workfile:     $
// $NoKeywords: $
//=============================================================================//

#if !defined(_STATIC_LINKED) || defined(_SHARED_LIB)


/// FIXME: As soon as all references to mathlib.c are gone, include it in here

#include <math.h>
#include <float.h>	// Needed for FLT_EPSILON

#include "basetypes.h"
#include <memory.h>
#include "tier0/dbg.h"

#include "tier0/vprof.h"
//#define _VPROF_MATHLIB

#pragma warning(disable:4244)   // "conversion from 'const int' to 'float', possible loss of data"
#pragma warning(disable:4730)	// "mixing _m64 and floating point expressions may result in incorrect code"

#include "mathlib.h"
#include "amd3dx.h"
#include "vector.h"

// memdbgon must be the last include file in a .cpp file!!!
#include "tier0/memdbgon.h"

static bool s_bMathlibInitialized = false;


#ifdef PARANOID
// User must provide an implementation of Sys_Error()
void Sys_Error (char *error, ...);
#endif
#ifdef _XBOX
#include <xmmintrin.h>
#endif

const Vector vec3_origin(0,0,0);
const QAngle vec3_angle(0,0,0);
const Vector vec3_invalid( FLT_MAX, FLT_MAX, FLT_MAX );
const int nanmask = 255<<23;


// Math routines which have alternate versions for MMX/SSE/3DNOW in MATHLIB.C
// void  _VectorMA( const float *start, float scale, const float *direction, float *dest );

#ifdef PFN_VECTORMA
void  __cdecl _SSE_VectorMA( const float *start, float scale, const float *direction, float *dest );
#endif

//-----------------------------------------------------------------------------
// Macros and constants required by some of the SSE assembly:
//-----------------------------------------------------------------------------

#ifdef _WIN32
#define _PS_EXTERN_CONST(Name, Val) \
const __declspec(align(16)) float _ps_##Name[4] = { Val, Val, Val, Val }

#define _PS_EXTERN_CONST_TYPE(Name, Type, Val) \
const __declspec(align(16)) Type _ps_##Name[4] = { Val, Val, Val, Val }; \

#define _EPI32_CONST(Name, Val) \
static const __declspec(align(16)) __int32 _epi32_##Name[4] = { Val, Val, Val, Val }

#define _PS_CONST(Name, Val) \
	static const __declspec(align(16)) float _ps_##Name[4] = { Val, Val, Val, Val }

#elif _LINUX
#define _PS_EXTERN_CONST(Name, Val) \
const __attribute__((aligned(16))) float _ps_##Name[4] = { Val, Val, Val, Val }

#define _PS_EXTERN_CONST_TYPE(Name, Type, Val) \
const __attribute__((aligned(16))) Type _ps_##Name[4] = { Val, Val, Val, Val }; \

#define _EPI32_CONST(Name, Val) \
static const __attribute__((aligned(16))) int32 _epi32_##Name[4] = { Val, Val, Val, Val }

#define _PS_CONST(Name, Val) \
	static const __attribute__((aligned(16))) float _ps_##Name[4] = { Val, Val, Val, Val }

#endif

_PS_EXTERN_CONST(am_0, 0.0f);
_PS_EXTERN_CONST(am_1, 1.0f);
_PS_EXTERN_CONST(am_m1, -1.0f);
_PS_EXTERN_CONST(am_0p5, 0.5f);
_PS_EXTERN_CONST(am_1p5, 1.5f);
_PS_EXTERN_CONST(am_pi, (float)M_PI);
_PS_EXTERN_CONST(am_pi_o_2, (float)(M_PI / 2.0));
_PS_EXTERN_CONST(am_2_o_pi, (float)(2.0 / M_PI));
_PS_EXTERN_CONST(am_pi_o_4, (float)(M_PI / 4.0));
_PS_EXTERN_CONST(am_4_o_pi, (float)(4.0 / M_PI));
_PS_EXTERN_CONST_TYPE(am_sign_mask, int32, 0x80000000);
_PS_EXTERN_CONST_TYPE(am_inv_sign_mask, int32, ~0x80000000);
_PS_EXTERN_CONST_TYPE(am_min_norm_pos,int32, 0x00800000);
_PS_EXTERN_CONST_TYPE(am_mant_mask, int32, 0x7f800000);
_PS_EXTERN_CONST_TYPE(am_inv_mant_mask, int32, ~0x7f800000);

_EPI32_CONST(1, 1);
_EPI32_CONST(2, 2);

_PS_CONST(sincos_p0, 0.15707963267948963959e1f);
_PS_CONST(sincos_p1, -0.64596409750621907082e0f);
_PS_CONST(sincos_p2, 0.7969262624561800806e-1f);
_PS_CONST(sincos_p3, -0.468175413106023168e-2f);

static const uint32 _sincos_masks[]	  = { (uint32)0x0,  (uint32)~0x0 };
static const uint32 _sincos_inv_masks[] = { (uint32)~0x0, (uint32)0x0 };

//-----------------------------------------------------------------------------
// Standard C implementations of optimized routines:
//-----------------------------------------------------------------------------
float _sqrtf(float _X)
{
	Assert( s_bMathlibInitialized );
	return sqrtf(_X); 
}

float _rsqrtf(float x)
{
	Assert( s_bMathlibInitialized );

	return 1.f / _sqrtf( x );
}

float FASTCALL _VectorNormalize (Vector& vec)
{
#ifdef _VPROF_MATHLIB
	VPROF_BUDGET( "_VectorNormalize", "Mathlib" );
#endif
	Assert( s_bMathlibInitialized );
	float radius = sqrtf(vec.x*vec.x + vec.y*vec.y + vec.z*vec.z);

	// FLT_EPSILON is added to the radius to eliminate the possibility of divide by zero.
	float iradius = 1.f / ( radius + FLT_EPSILON );
	
	vec.x *= iradius;
	vec.y *= iradius;
	vec.z *= iradius;
	
	return radius;
}

// TODO: Add fast C VectorNormalizeFast.
// Perhaps use approximate rsqrt trick, if the accuracy isn't too bad.
void FASTCALL _VectorNormalizeFast (Vector& vec)
{
	Assert( s_bMathlibInitialized );

	// FLT_EPSILON is added to the radius to eliminate the possibility of divide by zero.
	float iradius = 1.f / ( sqrtf(vec.x*vec.x + vec.y*vec.y + vec.z*vec.z) + FLT_EPSILON );
	
	vec.x *= iradius;
	vec.y *= iradius;
	vec.z *= iradius;
	
}

float _InvRSquared(const float* v)
{
	Assert( s_bMathlibInitialized );
	float	r2 = DotProduct(v, v);
	return r2 < 1.f ? 1.f : 1/r2;
}

// Math routines done in optimized assembly math package routines

//-----------------------------------------------------------------------------
// 3D Now Implementations of optimized routines:
//-----------------------------------------------------------------------------
float _3DNow_Sqrt(float x)
{
	Assert( s_bMathlibInitialized );
	float	root = 0.f;
#ifdef _WIN32
	_asm
	{
		femms
		movd		mm0, x
		PFRSQRT		(mm1,mm0)
		punpckldq	mm0, mm0
		PFMUL		(mm0, mm1)
		movd		root, mm0
		femms
	}
#elif _LINUX
 	__asm __volatile__( "femms" );
 	__asm __volatile__
	(
		"pfrsqrt    %y0, %y1 \n\t"
		"punpckldq   %y1, %y1 \n\t"
		"pfmul      %y1, %y0 \n\t"
		: "=y" (root), "=y" (x)
 		:"0" (x)
 	);
 	__asm __volatile__( "femms" );
#else
#error
#endif

	return root;
}

// NJS FIXME: Need to test Recripricol squareroot performance and accuraccy
// on AMD's before using the specialized instruction.
float _3DNow_RSqrt(float x)
{
	Assert( s_bMathlibInitialized );

	return 1.f / _3DNow_Sqrt(x);
}


float FASTCALL _3DNow_VectorNormalize (Vector& vec)
{
	Assert( s_bMathlibInitialized );
	float *v = &vec[0];
	float	radius = 0.f;

	if ( v[0] || v[1] || v[2] )
	{
#ifdef _WIN32
	_asm
		{
			mov			eax, v
			femms
			movq		mm0, QWORD PTR [eax]
			movd		mm1, DWORD PTR [eax+8]
			movq		mm2, mm0
			movq		mm3, mm1
			PFMUL		(mm0, mm0)
			PFMUL		(mm1, mm1)
			PFACC		(mm0, mm0)
			PFADD		(mm1, mm0)
			PFRSQRT		(mm0, mm1)
			punpckldq	mm1, mm1
			PFMUL		(mm1, mm0)
			PFMUL		(mm2, mm0)
			PFMUL		(mm3, mm0)
			movq		QWORD PTR [eax], mm2
			movd		DWORD PTR [eax+8], mm3
			movd		radius, mm1
			femms
		}
#elif _LINUX	
		long long a,c;
    		int b,d;
    		memcpy(&a,&vec[0],sizeof(a));
    		memcpy(&b,&vec[2],sizeof(b));
    		memcpy(&c,&vec[0],sizeof(c));
    		memcpy(&d,&vec[2],sizeof(d));

      		__asm __volatile__( "femms" );
        	__asm __volatile__
        	(
        		"pfmul           %y3, %y3\n\t"
        		"pfmul           %y0, %y0 \n\t"
        		"pfacc           %y3, %y3 \n\t"
        		"pfadd           %y3, %y0 \n\t"
        		"pfrsqrt         %y0, %y3 \n\t"
        		"punpckldq       %y0, %y0 \n\t"
        		"pfmul           %y3, %y0 \n\t"
        		"pfmul           %y3, %y2 \n\t"
        		"pfmul           %y3, %y1 \n\t"
        		: "=y" (radius), "=y" (c), "=y" (d)
        		: "y" (a), "0" (b), "1" (c), "2" (d)
        	);
      		memcpy(&vec[0],&c,sizeof(c));
      		memcpy(&vec[2],&d,sizeof(d));		
        	__asm __volatile__( "femms" );

#else
#error
#endif
	}
    return radius;
}

void FASTCALL _3DNow_VectorNormalizeFast (Vector& vec)
{
	_3DNow_VectorNormalize( vec );
}

// JAY: This complains with the latest processor pack
#pragma warning(disable: 4730)

float _3DNow_InvRSquared(const float* v)
{
	Assert( s_bMathlibInitialized );
	float	r2 = 1.f;
#ifdef _WIN32
	_asm { // AMD 3DNow only routine
		mov			eax, v
		femms
		movq		mm0, QWORD PTR [eax]
		movd		mm1, DWORD PTR [eax+8]
		movd		mm2, [r2]
		PFMUL		(mm0, mm0)
		PFMUL		(mm1, mm1)
		PFACC		(mm0, mm0)
		PFADD		(mm1, mm0)
		PFMAX		(mm1, mm2)
		PFRCP		(mm0, mm1)
		movd		[r2], mm0
		femms
	}
#elif _LINUX
		long long a,c;
    		int b;
    		memcpy(&a,&v[0],sizeof(a));
    		memcpy(&b,&v[2],sizeof(b));
    		memcpy(&c,&v[0],sizeof(c));

      		__asm __volatile__( "femms" );
        	__asm __volatile__
        	(
			"PFMUL          %y2, %y2 \n\t"
                        "PFMUL          %y3, %y3 \n\t"
                        "PFACC          %y2, %y2 \n\t"
                        "PFADD          %y2, %y3 \n\t"
                        "PFMAX          %y3, %y4 \n\t"
                        "PFRCP          %y3, %y2 \n\t"
                        "movq           %y2, %y0 \n\t"
        		: "=y" (r2)
        		: "0" (r2), "y" (a), "y" (b), "y" (c)
        	);
        	__asm __volatile__( "femms" );
#else
#error
#endif

	return r2;
}

//-----------------------------------------------------------------------------
// SSE implementations of optimized routines:
//-----------------------------------------------------------------------------
float _SSE_Sqrt(float x)
{
	Assert( s_bMathlibInitialized );
	float	root = 0.f;
#ifdef _WIN32
	_asm
	{
		sqrtss		xmm0, x
		movss		root, xmm0
	}
#elif _LINUX
	__asm__ __volatile__(
		"movss %1,%%xmm2\n"
		"sqrtss %%xmm2,%%xmm1\n"
		"movss %%xmm1,%0"
       		: "=m" (root)
		: "m" (x)
	);
#endif
	return root;
}

//	Single iteration NewtonRaphson reciprocal square root:
// 0.5 * rsqrtps * (3 - x * rsqrtps(x) * rsqrtps(x)) 	
// Very low error, and fine to use in place of 1.f / sqrtf(x).	
#if 0
	float _SSE_RSqrtAccurate(float x)
	{
		Assert( s_bMathlibInitialized );

		float rroot;
		_asm
		{
			rsqrtss	xmm0, x
			movss	rroot, xmm0
		}

		return (0.5f * rroot) * (3.f - (x * rroot) * rroot);
	}
#else

	// Intel / Kipps SSE RSqrt.  Significantly faster than above.
	inline float _SSE_RSqrtAccurate(float a)
	{
		float x;
		float half = 0.5f;
		float three = 3.f;

#ifdef _WIN32
		__asm
		{
			movss   xmm3, a;
			movss   xmm1, half;
			movss   xmm2, three;
			rsqrtss xmm0, xmm3;

			mulss   xmm3, xmm0;
			mulss   xmm1, xmm0;
			mulss   xmm3, xmm0;
			subss   xmm2, xmm3;
			mulss   xmm1, xmm2;

			movss   x,    xmm1;
		}
#elif _LINUX
	 __asm__ __volatile__(
		"movss   %1, %%xmm3 \n\t"
                "movss   %2, %%xmm1 \n\t"
                "movss   %3, %%xmm2 \n\t"
                "rsqrtss %%xmm3, %%xmm0 \n\t"
                "mulss   %%xmm0, %%xmm3 \n\t"
                "mulss   %%xmm0, %%xmm1 \n\t"
                "mulss   %%xmm0, %%xmm3 \n\t"
                "subss   %%xmm3, %%xmm2 \n\t"
                "mulss   %%xmm2, %%xmm1 \n\t"
                "movss   %%xmm1, %0 \n\t"
		: "=m" (x)
		: "m" (a), "m" (half), "m" (three)
	);
#else
#error
#endif

		return x;
	}
#endif

// Simple SSE rsqrt.  Usually accurate to around 6 (relative) decimal places 
// or so, so ok for closed transforms.  (ie, computing lighting normals)
inline float _SSE_RSqrtFast(float x)
{
	Assert( s_bMathlibInitialized );

	float rroot;
#ifdef _WIN32
	_asm
	{
		rsqrtss	xmm0, x
		movss	rroot, xmm0
	}
#elif _LINUX
	 __asm__ __volatile__(
		"rsqrtss %1, %%xmm0 \n\t"
		"movss %%xmm0, %0 \n\t"
		: "=m" (x)
		: "m" (rroot)
		: "%xmm0"
	);
#else
#error
#endif

	return rroot;
}

float FASTCALL _SSE_VectorNormalize (Vector& vec)
{
	Assert( s_bMathlibInitialized );

	// NOTE: This is necessary to prevent an memory overwrite...
	// sice vec only has 3 floats, we can't "movaps" directly into it.
#ifdef _WIN32
	__declspec(align(16)) float result[4];
#elif _LINUX
	__attribute__((aligned(16))) float result[4];
#endif

	float *v = &vec[0];
	float *r = &result[0];

	float	radius = 0.f;
	// Blah, get rid of these comparisons ... in reality, if you have all 3 as zero, it shouldn't 
	// be much of a performance win, considering you will very likely miss 3 branch predicts in a row.
	if ( v[0] || v[1] || v[2] )
	{
#ifdef _WIN32
	_asm
		{
			mov			eax, v
			mov			edx, r
#ifdef ALIGNED_VECTOR
			movaps		xmm4, [eax]			// r4 = vx, vy, vz, X
			movaps		xmm1, xmm4			// r1 = r4
#else
			movups		xmm4, [eax]			// r4 = vx, vy, vz, X
			movaps		xmm1, xmm4			// r1 = r4
#endif
			mulps		xmm1, xmm4			// r1 = vx * vx, vy * vy, vz * vz, X
			movhlps		xmm3, xmm1			// r3 = vz * vz, X, X, X
			movaps		xmm2, xmm1			// r2 = r1
			shufps		xmm2, xmm2, 1		// r2 = vy * vy, X, X, X
			addss		xmm1, xmm2			// r1 = (vx * vx) + (vy * vy), X, X, X
			addss		xmm1, xmm3			// r1 = (vx * vx) + (vy * vy) + (vz * vz), X, X, X
			sqrtss		xmm1, xmm1			// r1 = sqrt((vx * vx) + (vy * vy) + (vz * vz)), X, X, X
			movss		radius, xmm1		// radius = sqrt((vx * vx) + (vy * vy) + (vz * vz))
			rcpss		xmm1, xmm1			// r1 = 1/radius, X, X, X
			shufps		xmm1, xmm1, 0		// r1 = 1/radius, 1/radius, 1/radius, X
			mulps		xmm4, xmm1			// r4 = vx * 1/radius, vy * 1/radius, vz * 1/radius, X
			movaps		[edx], xmm4			// v = vx * 1/radius, vy * 1/radius, vz * 1/radius, X
		}
#elif _LINUX
		__asm__ __volatile__(
#ifdef ALIGNED_VECTOR
                        "movaps          %2, %%xmm4 \n\t"
                        "movaps          %%xmm4, %%xmm1 \n\t"
#else
                        "movups          %2, %%xmm4 \n\t"
                        "movaps          %%xmm4, %%xmm1 \n\t"
#endif
                        "mulps           %%xmm4, %%xmm1 \n\t"
                        "movhlps         %%xmm1, %%xmm3 \n\t"
                        "movaps          %%xmm1, %%xmm2 \n\t"
                        "shufps          $1, %%xmm2, %%xmm2 \n\t"
                        "addss           %%xmm2, %%xmm1 \n\t"
                        "addss           %%xmm3, %%xmm1 \n\t"
                        "sqrtss          %%xmm1, %%xmm1 \n\t"
                        "movss           %%xmm1, %0 \n\t"
                        "rcpss           %%xmm1, %%xmm1 \n\t"
                        "shufps          $0, %%xmm1, %%xmm1 \n\t"
                        "mulps           %%xmm1, %%xmm4 \n\t"
                        "movaps          %%xmm4, %1 \n\t"
                        : "=m" (radius), "=m" (result)
                        : "m" (*v)
 		);
#else
#error
#endif
		vec.x = result[0];
		vec.y = result[1];
		vec.z = result[2];

	}

	return radius;
}

void FASTCALL _SSE_VectorNormalizeFast (Vector& vec)
{
	float ool = _SSE_RSqrtAccurate( FLT_EPSILON + vec.x * vec.x + vec.y * vec.y + vec.z * vec.z );

	vec.x *= ool;
	vec.y *= ool;
	vec.z *= ool;
}

float _SSE_InvRSquared(const float* v)
{
	float	inv_r2 = 1.f;
#ifdef _WIN32
	_asm { // Intel SSE only routine
		mov			eax, v
		movss		xmm5, inv_r2		// x5 = 1.0, 0, 0, 0
#ifdef ALIGNED_VECTOR
		movaps		xmm4, [eax]			// x4 = vx, vy, vz, X
#else
		movups		xmm4, [eax]			// x4 = vx, vy, vz, X
#endif
		movaps		xmm1, xmm4			// x1 = x4
		mulps		xmm1, xmm4			// x1 = vx * vx, vy * vy, vz * vz, X
		movhlps		xmm3, xmm1			// x3 = vz * vz, X, X, X
		movaps		xmm2, xmm1			// x2 = x1
		shufps		xmm2, xmm2, 1		// x2 = vy * vy, X, X, X
		addss		xmm1, xmm2			// x1 = (vx * vx) + (vy * vy), X, X, X
		addss		xmm1, xmm3			// x1 = (vx * vx) + (vy * vy) + (vz * vz), X, X, X
		maxss		xmm1, xmm5			// x1 = max( 1.0, x1 )
		rcpss		xmm0, xmm1			// x0 = 1 / max( 1.0, x1 )
		movss		inv_r2, xmm0		// inv_r2 = x0
	}
#elif _LINUX
		__asm__ __volatile__(
#ifdef ALIGNED_VECTOR
                        "movaps          %1, %%xmm4 \n\t"
#else
                        "movups          %1, %%xmm4 \n\t"
#endif
                        "movaps          %%xmm4, %%xmm1 \n\t"
                        "mulps           %%xmm4, %%xmm1 \n\t"
			"movhlps         %%xmm1, %%xmm3 \n\t"
			"movaps          %%xmm1, %%xmm2 \n\t"
                        "shufps          $1, %%xmm2, %%xmm2 \n\t"
                        "addss           %%xmm2, %%xmm1 \n\t"
                        "addss           %%xmm3, %%xmm1 \n\t"
			"maxss           %%xmm5, %%xmm1 \n\t"
                        "rcpss           %%xmm1, %%xmm0 \n\t"
			"movss           %%xmm0, %0 \n\t" 
                        : "=m" (inv_r2)
                        : "m" (*v), "0" (inv_r2)
 		);

#else
#error
#endif

	return inv_r2;
}

#ifdef _WIN32

void _SSE_SinCos(float x, float* s, float* c)
{

	float t4, t8, t12;

	__asm
	{
		movss	xmm0, x
		movss	t12, xmm0
		movss	xmm1, _ps_am_inv_sign_mask
		mov		eax, t12
		mulss	xmm0, _ps_am_2_o_pi
		andps	xmm0, xmm1
		and		eax, 0x80000000

		cvttss2si	edx, xmm0
		mov		ecx, edx
		mov		t12, esi
		mov		esi, edx
		add		edx, 0x1	
		shl		ecx, (31 - 1)
		shl		edx, (31 - 1)

		movss	xmm4, _ps_am_1
		cvtsi2ss	xmm3, esi
		mov		t8, eax
		and		esi, 0x1

		subss	xmm0, xmm3
		movss	xmm3, _sincos_inv_masks[esi * 4]
		minss	xmm0, xmm4

		subss	xmm4, xmm0

		movss	xmm6, xmm4
		andps	xmm4, xmm3
		and		ecx, 0x80000000
		movss	xmm2, xmm3
		andnps	xmm3, xmm0
		and		edx, 0x80000000
		movss	xmm7, t8
		andps	xmm0, xmm2
		mov		t8, ecx
		mov		t4, edx
		orps	xmm4, xmm3

		mov		eax, s     //mov eax, [esp + 4 + 16]
		mov		edx, c //mov edx, [esp + 4 + 16 + 4]

		andnps	xmm2, xmm6
		orps	xmm0, xmm2

		movss	xmm2, t8
		movss	xmm1, xmm0
		movss	xmm5, xmm4
		xorps	xmm7, xmm2
		movss	xmm3, _ps_sincos_p3
		mulss	xmm0, xmm0
		mulss	xmm4, xmm4
		movss	xmm2, xmm0
		movss	xmm6, xmm4
		orps	xmm1, xmm7
		movss	xmm7, _ps_sincos_p2
		mulss	xmm0, xmm3
		mulss	xmm4, xmm3
		movss	xmm3, _ps_sincos_p1
		addss	xmm0, xmm7
		addss	xmm4, xmm7
		movss	xmm7, _ps_sincos_p0
		mulss	xmm0, xmm2
		mulss	xmm4, xmm6
		addss	xmm0, xmm3
		addss	xmm4, xmm3
		movss	xmm3, t4
		mulss	xmm0, xmm2
		mulss	xmm4, xmm6
		orps	xmm5, xmm3
		mov		esi, t12
		addss	xmm0, xmm7
		addss	xmm4, xmm7
		mulss	xmm0, xmm1
		mulss	xmm4, xmm5

		// use full stores since caller might reload with full loads
		movss	[eax], xmm0
		movss	[edx], xmm4

	}
}


float _SSE_cos( float x)
{
	float temp;
	__asm
	{
		movss	xmm0, x
		movss	xmm1, _ps_am_inv_sign_mask
		andps	xmm0, xmm1
		addss	xmm0, _ps_am_pi_o_2
		mulss	xmm0, _ps_am_2_o_pi

		cvttss2si	ecx, xmm0
		movss	xmm5, _ps_am_1
		mov		edx, ecx
		shl		edx, (31 - 1)
		cvtsi2ss	xmm1, ecx
		and		edx, 0x80000000
		and		ecx, 0x1

		subss	xmm0, xmm1
		movss	xmm6, _sincos_masks[ecx * 4]
		minss	xmm0, xmm5

		movss	xmm1, _ps_sincos_p3
		subss	xmm5, xmm0

		andps	xmm5, xmm6
		movss	xmm7, _ps_sincos_p2
		andnps	xmm6, xmm0
		mov		temp, edx
		orps	xmm5, xmm6
		movss	xmm0, xmm5

		mulss	xmm5, xmm5
		movss	xmm4, _ps_sincos_p1
		movss	xmm2, xmm5
		mulss	xmm5, xmm1
		movss	xmm1, _ps_sincos_p0
		addss	xmm5, xmm7
		mulss	xmm5, xmm2
		movss	xmm3, temp
		addss	xmm5, xmm4
		mulss	xmm5, xmm2
		orps	xmm0, xmm3
		addss	xmm5, xmm1
		mulss	xmm0, xmm5
		
		movss   x,    xmm0

	}

	return x;
}


//-----------------------------------------------------------------------------
// SSE2 implementations of optimized routines:
//-----------------------------------------------------------------------------
#ifndef _XBOX
void _SSE2_SinCos(float x, float* s, float* c)  // any x
{
	__asm
	{
		movss	xmm0, x
		movaps	xmm7, xmm0
		movss	xmm1, _ps_am_inv_sign_mask
		movss	xmm2, _ps_am_sign_mask
		movss	xmm3, _ps_am_2_o_pi
		andps	xmm0, xmm1
		andps	xmm7, xmm2
		mulss	xmm0, xmm3

		pxor	xmm3, xmm3
		movd	xmm5, _epi32_1
		movss	xmm4, _ps_am_1

		cvttps2dq	xmm2, xmm0
		pand	xmm5, xmm2
		movd	xmm1, _epi32_2
		pcmpeqd	xmm5, xmm3
		movd	xmm3, _epi32_1
		cvtdq2ps	xmm6, xmm2
		paddd	xmm3, xmm2
		pand	xmm2, xmm1
		pand	xmm3, xmm1
		subss	xmm0, xmm6
		pslld	xmm2, (31 - 1)
		minss	xmm0, xmm4

		mov		eax, s     // mov eax, [esp + 4 + 16]
		mov		edx, c	   // mov edx, [esp + 4 + 16 + 4]

		subss	xmm4, xmm0
		pslld	xmm3, (31 - 1)

		movaps	xmm6, xmm4
		xorps	xmm2, xmm7
		movaps	xmm7, xmm5
		andps	xmm6, xmm7
		andnps	xmm7, xmm0
		andps	xmm0, xmm5
		andnps	xmm5, xmm4
		movss	xmm4, _ps_sincos_p3
		orps	xmm6, xmm7
		orps	xmm0, xmm5
		movss	xmm5, _ps_sincos_p2

		movaps	xmm1, xmm0
		movaps	xmm7, xmm6
		mulss	xmm0, xmm0
		mulss	xmm6, xmm6
		orps	xmm1, xmm2
		orps	xmm7, xmm3
		movaps	xmm2, xmm0
		movaps	xmm3, xmm6
		mulss	xmm0, xmm4
		mulss	xmm6, xmm4
		movss	xmm4, _ps_sincos_p1
		addss	xmm0, xmm5
		addss	xmm6, xmm5
		movss	xmm5, _ps_sincos_p0
		mulss	xmm0, xmm2
		mulss	xmm6, xmm3
		addss	xmm0, xmm4
		addss	xmm6, xmm4
		mulss	xmm0, xmm2
		mulss	xmm6, xmm3
		addss	xmm0, xmm5
		addss	xmm6, xmm5
		mulss	xmm0, xmm1
		mulss	xmm6, xmm7

		// use full stores since caller might reload with full loads
		movss	[eax], xmm0
		movss	[edx], xmm6

	}
}
#endif

#ifndef _XBOX
float _SSE2_cos(float x)  
{
	__asm
	{
		movss	xmm0, x
		movss	xmm1, _ps_am_inv_sign_mask
		movss	xmm2, _ps_am_pi_o_2
		movss	xmm3, _ps_am_2_o_pi
		andps	xmm0, xmm1
		addss	xmm0, xmm2
		mulss	xmm0, xmm3

		pxor	xmm3, xmm3
		movd	xmm5, _epi32_1
		movss	xmm4, _ps_am_1
		cvttps2dq	xmm2, xmm0
		pand	xmm5, xmm2
		movd	xmm1, _epi32_2
		pcmpeqd	xmm5, xmm3
		cvtdq2ps	xmm6, xmm2
		pand	xmm2, xmm1
		pslld	xmm2, (31 - 1)

		subss	xmm0, xmm6
		movss	xmm3, _ps_sincos_p3
		minss	xmm0, xmm4
		subss	xmm4, xmm0
		andps	xmm0, xmm5
		andnps	xmm5, xmm4
		orps	xmm0, xmm5

		movaps	xmm1, xmm0
		movss	xmm4, _ps_sincos_p2
		mulss	xmm0, xmm0
		movss	xmm5, _ps_sincos_p1
		orps	xmm1, xmm2
		movaps	xmm7, xmm0
		mulss	xmm0, xmm3
		movss	xmm6, _ps_sincos_p0
		addss	xmm0, xmm4
		mulss	xmm0, xmm7
		addss	xmm0, xmm5
		mulss	xmm0, xmm7
		addss	xmm0, xmm6
		mulss	xmm0, xmm1
		movss   x,    xmm0
	}

	return x;
}
#endif
#elif _LINUX
#warning "_SSE_sincos,_SSE_cos,_SSE2_cos,_SSE_sincos NOT implemented!"
#else
#error
#endif

//-----------------------------------------------------------------------------
// Function pointers selecting the appropriate implementation from above:
//-----------------------------------------------------------------------------
float (*pfSqrt)(float x)  = _sqrtf;
float (*pfRSqrt)(float x) = _rsqrtf;
float (*pfRSqrtFast)(float x) = _rsqrtf;
float (FASTCALL *pfVectorNormalize)(Vector& v) = _VectorNormalize;
void  (FASTCALL *pfVectorNormalizeFast)(Vector& v) = _VectorNormalizeFast;
float (*pfInvRSquared)(const float* v) = _InvRSquared;
void  (*pfFastSinCos)(float x, float* s, float* c) = SinCos;
float (*pfFastCos)(float x) = cosf;


float SinCosTable[SIN_TABLE_SIZE];

void InitSinCosTable()
{
	for( int i = 0; i < SIN_TABLE_SIZE; i++ )
	{
		SinCosTable[i] = sin(i * 2.0 * M_PI / SIN_TABLE_SIZE);
	}
}

#ifdef PARANOID
/*
==================
BOPS_Error

Split out like this for ASM to call.
==================
*/
void BOPS_Error (void)
{
	Sys_Error ("BoxOnPlaneSide:  Bad signbits");
}
#endif


#if	!id386 && !_LINUX

/*
==================
BoxOnPlaneSide

Returns 1, 2, or 1 + 2
==================
*/
int BoxOnPlaneSide (const float *emins, const float *emaxs, const cplane_t *p)
{
	Assert( s_bMathlibInitialized );
	float	dist1, dist2;
	int		sides;

#if 0	// this is done by the BOX_ON_PLANE_SIDE macro before calling this
		// function
// fast axial cases
	if (p->type < 3)
	{
		if (p->dist <= emins[p->type])
			return 1;
		if (p->dist >= emaxs[p->type])
			return 2;
		return 3;
	}
#endif
	
	// general case
	switch (p->signbits)
	{
	case 0:
		dist1 = p->normal[0]*emaxs[0] + p->normal[1]*emaxs[1] + p->normal[2]*emaxs[2];
		dist2 = p->normal[0]*emins[0] + p->normal[1]*emins[1] + p->normal[2]*emins[2];
		break;
	case 1:
		dist1 = p->normal[0]*emins[0] + p->normal[1]*emaxs[1] + p->normal[2]*emaxs[2];
		dist2 = p->normal[0]*emaxs[0] + p->normal[1]*emins[1] + p->normal[2]*emins[2];
		break;
	case 2:
		dist1 = p->normal[0]*emaxs[0] + p->normal[1]*emins[1] + p->normal[2]*emaxs[2];
		dist2 = p->normal[0]*emins[0] + p->normal[1]*emaxs[1] + p->normal[2]*emins[2];
		break;
	case 3:
		dist1 = p->normal[0]*emins[0] + p->normal[1]*emins[1] + p->normal[2]*emaxs[2];
		dist2 = p->normal[0]*emaxs[0] + p->normal[1]*emaxs[1] + p->normal[2]*emins[2];
		break;
	case 4:
		dist1 = p->normal[0]*emaxs[0] + p->normal[1]*emaxs[1] + p->normal[2]*emins[2];
		dist2 = p->normal[0]*emins[0] + p->normal[1]*emins[1] + p->normal[2]*emaxs[2];
		break;
	case 5:
		dist1 = p->normal[0]*emins[0] + p->normal[1]*emaxs[1] + p->normal[2]*emins[2];
		dist2 = p->normal[0]*emaxs[0] + p->normal[1]*emins[1] + p->normal[2]*emaxs[2];
		break;
	case 6:
		dist1 = p->normal[0]*emaxs[0] + p->normal[1]*emins[1] + p->normal[2]*emins[2];
		dist2 = p->normal[0]*emins[0] + p->normal[1]*emaxs[1] + p->normal[2]*emaxs[2];
		break;
	case 7:
		dist1 = p->normal[0]*emins[0] + p->normal[1]*emins[1] + p->normal[2]*emins[2];
		dist2 = p->normal[0]*emaxs[0] + p->normal[1]*emaxs[1] + p->normal[2]*emaxs[2];
		break;
	default:
		dist1 = dist2 = 0;		// shut up compiler
#ifdef PARANOID
		BOPS_Error ();
#endif
		break;
	}

#if 0
	int		i;
	float	corners[2][3];

	for (i=0 ; i<3 ; i++)
	{
		if (plane->normal[i] < 0)
		{
			corners[0][i] = emins[i];
			corners[1][i] = emaxs[i];
		}
		else
		{
			corners[1][i] = emins[i];
			corners[0][i] = emaxs[i];
		}
	}
	dist = DotProduct (plane->normal, corners[0]) - plane->dist;
	dist2 = DotProduct (plane->normal, corners[1]) - plane->dist;
	sides = 0;
	if (dist1 >= 0)
		sides = 1;
	if (dist2 < 0)
		sides |= 2;

#endif

	sides = 0;
	if (dist1 >= p->dist)
		sides = 1;
	if (dist2 < p->dist)
		sides |= 2;

#ifdef PARANOID
if (sides == 0)
	Sys_Error ("BoxOnPlaneSide: sides==0");
#endif

	return sides;
}

#endif
//#endif

qboolean VectorsEqual( const float *v1, const float *v2 )
{
	Assert( s_bMathlibInitialized );
	return ( ( v1[0] == v2[0] ) &&
		     ( v1[1] == v2[1] ) &&
			 ( v1[2] == v2[2] ) );
}



int LineSphereIntersection(
	const Vector &vSphereCenter,
	const float fSphereRadius,
	const Vector &vLinePt,
	const Vector &vLineDir,
	float *fIntersection1,
	float *fIntersection2)
{
	Assert( s_bMathlibInitialized );
	// Line = P + Vt
	// Sphere = r (assume we've translated to origin)
	// (P + Vt)^2 = r^2
	// VVt^2 + 2PVt + (PP - r^2)
	// Solve as quadratic:  (-b  +/-  sqrt(b^2 - 4ac)) / 2a
	// If (b^2 - 4ac) is < 0 there is no solution.
	// If (b^2 - 4ac) is = 0 there is one solution (a case this function doesn't support).
	// If (b^2 - 4ac) is > 0 there are two solutions.
	Vector P;
	float a, b, c, sqr, insideSqr;


	// Translate sphere to origin.
	P[0] = vLinePt[0] - vSphereCenter[0];
	P[1] = vLinePt[1] - vSphereCenter[1];
	P[2] = vLinePt[2] - vSphereCenter[2];
	
	a = vLineDir.Dot(vLineDir);
	b = 2.0f * P.Dot(vLineDir);
	c = P.Dot(P) - (fSphereRadius * fSphereRadius);

	insideSqr = b*b - 4*a*c;
	if(insideSqr <= 0.000001f)
		return 0;

	// Ok, two solutions.
	sqr = (float)sqrt(insideSqr);
	
	*fIntersection1 = (-b + sqr) / (2.0f * a);
	*fIntersection2 = (-b - sqr) / (2.0f * a);

	return 1;
}


//-----------------------------------------------------------------------------
// Purpose: Generates Euler angles given a left-handed orientation matrix. The
//			columns of the matrix contain the forward, left, and up vectors.
// Input  : matrix - Left-handed orientation matrix.
//			angles[PITCH, YAW, ROLL]. Receives right-handed counterclockwise
//				rotations in degrees around Y, Z, and X respectively.
//-----------------------------------------------------------------------------

void MatrixAngles( const matrix3x4_t& matrix, RadianEuler &angles, Vector &position )
{
	MatrixGetColumn( matrix, 3, position );
	MatrixAngles( matrix, angles );
}

void MatrixAngles( const matrix3x4_t &matrix, Quaternion &q, Vector &pos )
{
#ifdef _VPROF_MATHLIB
	VPROF_BUDGET( "MatrixQuaternion", "Mathlib" );
#endif
	float trace;
	trace = matrix[0][0] + matrix[1][1] + matrix[2][2] + 1.0f;
	if( trace > 1.0f + FLT_EPSILON ) 
	{
		// VPROF_INCREMENT_COUNTER("MatrixQuaternion A",1);
		q.x = ( matrix[2][1] - matrix[1][2] );
		q.y = ( matrix[0][2] - matrix[2][0] );
		q.z = ( matrix[1][0] - matrix[0][1] );
		q.w = trace;
	} 
	else if ( matrix[0][0] > matrix[1][1] && matrix[0][0] > matrix[2][2] ) 
	{
		// VPROF_INCREMENT_COUNTER("MatrixQuaternion B",1);
		trace = 1.0f + matrix[0][0] - matrix[1][1] - matrix[2][2];
		q.x = trace;
		q.y = (matrix[1][0] + matrix[0][1] );
		q.z = (matrix[0][2] + matrix[2][0] );
		q.w = (matrix[2][1] - matrix[1][2] );
	} 
	else if (matrix[1][1] > matrix[2][2])
	{
		// VPROF_INCREMENT_COUNTER("MatrixQuaternion C",1);
		trace = 1.0f + matrix[1][1] - matrix[0][0] - matrix[2][2];
		q.x = (matrix[0][1] + matrix[1][0] );
		q.y = trace;
		q.z = (matrix[2][1] + matrix[1][2] );
		q.w = (matrix[0][2] - matrix[2][0] );
	}
	else
	{
		// VPROF_INCREMENT_COUNTER("MatrixQuaternion D",1);
		trace = 1.0f + matrix[2][2] - matrix[0][0] - matrix[1][1];
		q.x = (matrix[0][2] + matrix[2][0] );
		q.y = (matrix[2][1] + matrix[1][2] );
		q.z = trace;
		q.w = (matrix[1][0] - matrix[0][1] );
	}

	QuaternionNormalize( q );

#if 0
	// check against the angle version
	RadianEuler ang;
	MatrixAngles( matrix, ang );
	Quaternion test;
	AngleQuaternion( ang, test );
	float d = QuaternionDotProduct( q, test );
	Assert( fabs(d) > 0.99 && fabs(d) < 1.01 );
#endif

	MatrixGetColumn( matrix, 3, pos );
}

void MatrixAngles( const matrix3x4_t& matrix, float *angles )
{ 
#ifdef _VPROF_MATHLIB
	VPROF_BUDGET( "MatrixAngles", "Mathlib" );
#endif
	Assert( s_bMathlibInitialized );
	float forward[3];
	float left[3];
	float up[3];

	//
	// Extract the basis vectors from the matrix. Since we only need the Z
	// component of the up vector, we don't get X and Y.
	//
	forward[0] = matrix[0][0];
	forward[1] = matrix[1][0];
	forward[2] = matrix[2][0];
	left[0] = matrix[0][1];
	left[1] = matrix[1][1];
	left[2] = matrix[2][1];
	up[2] = matrix[2][2];

	float xyDist = sqrtf( forward[0] * forward[0] + forward[1] * forward[1] );
	
	// enough here to get angles?
	if ( xyDist > 0.001f )
	{
		// (yaw)	y = ATAN( forward.y, forward.x );		-- in our space, forward is the X axis
		angles[1] = RAD2DEG( atan2f( forward[1], forward[0] ) );

		// The engine does pitch inverted from this, but we always end up negating it in the DLL
		// UNDONE: Fix the engine to make it consistent
		// (pitch)	x = ATAN( -forward.z, sqrt(forward.x*forward.x+forward.y*forward.y) );
		angles[0] = RAD2DEG( atan2f( -forward[2], xyDist ) );

		// (roll)	z = ATAN( left.z, up.z );
		angles[2] = RAD2DEG( atan2f( left[2], up[2] ) );
	}
	else	// forward is mostly Z, gimbal lock-
	{
		// (yaw)	y = ATAN( -left.x, left.y );			-- forward is mostly z, so use right for yaw
		angles[1] = RAD2DEG( atan2f( -left[0], left[1] ) );

		// The engine does pitch inverted from this, but we always end up negating it in the DLL
		// UNDONE: Fix the engine to make it consistent
		// (pitch)	x = ATAN( -forward.z, sqrt(forward.x*forward.x+forward.y*forward.y) );
		angles[0] = RAD2DEG( atan2f( -forward[2], xyDist ) );

		// Assume no roll in this case as one degree of freedom has been lost (i.e. yaw == roll)
		angles[2] = 0;
	}
}

#if 0
/*
=============
DecomposeRotation
=============
*/
// What is this for and why is this different from MatrixAngle?
void DecomposeRotation( const matrix3x4_t& mat, float* out )
{
	Assert( s_bMathlibInitialized );
	float cp;
	qboolean fixYaw = 0;
	qboolean fixRoll = 0;

	// Start off in radians
	out[ PITCH ] = asin( -mat[ 0 ][ 2 ] );

	cp = cos( out[ PITCH ] );
	if( cp )
	{
		float cy = mat[ 0 ][ 0 ] / cp;
		float cr = mat[ 2 ][ 2 ] / cp;

		// fix xome rounding error
		if( cy > 1.0 ) cy = 1.0;
		else if( cy < -1.0 ) cy = -1.0;
		if( cr > 1.0 ) cr = 1.0;
		else if( cr < -1.0 ) cr = -1.0;

		out[ YAW ] = acos( cy );
		out[ ROLL ] = acos( cr );
	}
	else
	{
		out[ YAW ] = 0;
		out[ ROLL ] = acos( mat[ 1 ][ 1 ] );
	}

	// convert angles to degrees
	out[ PITCH ] *= 180.0 / M_PI;
	out[ YAW ] *= 180.0 / M_PI;
	out[ ROLL ] *= 180.0 / M_PI;

	// correct for quadrants
	fixYaw = mat[ 0 ][ 1 ] / cp > 0;
	fixRoll = ( mat[ 1 ][ 2 ] / -cp < 0 );

	if( fixYaw )
		out[ YAW ] = 360 - out[ YAW ];
	if( fixRoll )
		out[ ROLL ] = 360 - out[ ROLL ];

	// normalize
#if 0
	if( out[ PITCH ] < 0 )
		out[ PITCH ] += 360;
	if( out[ YAW ] < 0 )
		out[ YAW ] += 360;
	if( out[ ROLL ] < 0 )
		out[ ROLL ] += 360;
#endif
}
#endif


void VectorTransform (const float *in1, const matrix3x4_t& in2, float *out)
{
	Assert( s_bMathlibInitialized );
	Assert( in1 != out );
	out[0] = DotProduct(in1, in2[0]) + in2[0][3];
	out[1] = DotProduct(in1, in2[1]) + in2[1][3];
	out[2] = DotProduct(in1, in2[2]) + in2[2][3];
}

// SSE Version of VectorTransform
void VectorTransformSSE(const float *in1, const matrix3x4_t& in2, float *out1)
{
	Assert( s_bMathlibInitialized );
	Assert( in1 != out1 );

#ifdef _WIN32
	__asm
	{
		mov eax, in1;
		mov ecx, in2;
		mov edx, out1;

		movss xmm0, [eax];
		mulss xmm0, [ecx];
		movss xmm1, [eax+4];
		mulss xmm1, [ecx+4];
		movss xmm2, [eax+8];
		mulss xmm2, [ecx+8];
		addss xmm0, xmm1;
		addss xmm0, xmm2;
		addss xmm0, [ecx+12]
 		movss [edx], xmm0;
		add ecx, 16;

		movss xmm0, [eax];
		mulss xmm0, [ecx];
		movss xmm1, [eax+4];
		mulss xmm1, [ecx+4];
		movss xmm2, [eax+8];
		mulss xmm2, [ecx+8];
		addss xmm0, xmm1;
		addss xmm0, xmm2;
		addss xmm0, [ecx+12]
		movss [edx+4], xmm0;
		add ecx, 16;

		movss xmm0, [eax];
		mulss xmm0, [ecx];
		movss xmm1, [eax+4];
		mulss xmm1, [ecx+4];
		movss xmm2, [eax+8];
		mulss xmm2, [ecx+8];
		addss xmm0, xmm1;
		addss xmm0, xmm2;
		addss xmm0, [ecx+12]
		movss [edx+8], xmm0;
	}
#elif _LINUX
#warning "VectorTransformSSE C implementation only"
	out1[0] = DotProduct(in1, in2[0]) + in2[0][3];
	out1[1] = DotProduct(in1, in2[1]) + in2[1][3];
	out1[2] = DotProduct(in1, in2[2]) + in2[2][3];
#else
#error
#endif
}

void VectorITransform (const float *in1, const matrix3x4_t& in2, float *out)
{
	Assert( s_bMathlibInitialized );
	float in1t[3];

	in1t[0] = in1[0] - in2[0][3];
	in1t[1] = in1[1] - in2[1][3];
	in1t[2] = in1[2] - in2[2][3];

	out[0] = in1t[0] * in2[0][0] + in1t[1] * in2[1][0] + in1t[2] * in2[2][0];
	out[1] = in1t[0] * in2[0][1] + in1t[1] * in2[1][1] + in1t[2] * in2[2][1];
	out[2] = in1t[0] * in2[0][2] + in1t[1] * in2[1][2] + in1t[2] * in2[2][2];
}


void VectorRotate( const float *in1, const matrix3x4_t& in2, float *out )
{
	Assert( s_bMathlibInitialized );
	Assert( in1 != out );
	out[0] = DotProduct( in1, in2[0] );
	out[1] = DotProduct( in1, in2[1] );
	out[2] = DotProduct( in1, in2[2] );
}

void VectorRotate( const Vector &in1, const QAngle &in2, Vector &out )
{
	matrix3x4_t matRotate;
	AngleMatrix( in2, matRotate );
	VectorRotate( in1, matRotate, out );
}

void VectorRotateSSE( const float *in1, const matrix3x4_t& in2, float *out1 )
{
	Assert( s_bMathlibInitialized );
	Assert( in1 != out1 );

#ifdef _WIN32
	__asm
	{
		mov eax, in1;
		mov ecx, in2;
		mov edx, out1;

		movss xmm0, [eax];
		mulss xmm0, [ecx];
		movss xmm1, [eax+4];
		mulss xmm1, [ecx+4];
		movss xmm2, [eax+8];
		mulss xmm2, [ecx+8];
		addss xmm0, xmm1;
		addss xmm0, xmm2;
 		movss [edx], xmm0;
		add ecx, 16;

		movss xmm0, [eax];
		mulss xmm0, [ecx];
		movss xmm1, [eax+4];
		mulss xmm1, [ecx+4];
		movss xmm2, [eax+8];
		mulss xmm2, [ecx+8];
		addss xmm0, xmm1;
		addss xmm0, xmm2;
		movss [edx+4], xmm0;
		add ecx, 16;

		movss xmm0, [eax];
		mulss xmm0, [ecx];
		movss xmm1, [eax+4];
		mulss xmm1, [ecx+4];
		movss xmm2, [eax+8];
		mulss xmm2, [ecx+8];
		addss xmm0, xmm1;
		addss xmm0, xmm2;
		movss [edx+8], xmm0;
	}
#elif _LINUX
#warning "VectorRotateSSE C implementation only"
	out1[0] = DotProduct( in1, in2[0] );
	out1[1] = DotProduct( in1, in2[1] );
	out1[2] = DotProduct( in1, in2[2] );

#else
#error
#endif
}


//-----------------------------------------------------------------------------
// NOTE: This actually works, but we need to be sure it actually optimizes anything
// I've actually really only added it to make sure it's in source control
//-----------------------------------------------------------------------------
void TransformAndRotate( const Vector &srcPos, const Vector &srcNorm, 
							const matrix3x4_t& mat, Vector &pos, Vector &norm )
{
	const float *pMat = &mat[0][0];
	const float *pPos = &srcPos.x;
	const float *pNormal = &srcNorm.x;
	float *pPosOut = &pos.x;
	float *pNormalOut = &norm.x;

//	Vector pt, nt;
//	pt[0] = DotProduct(vert.m_vecPosition.Base(), mat[0]) + mat[0][3];
//	pt[1] = DotProduct(vert.m_vecPosition.Base(), mat[1]) + mat[1][3];
//	pt[2] = DotProduct(vert.m_vecPosition.Base(), mat[2]) + mat[2][3];
//	nt[0] = DotProduct( vert.m_vecNormal.Base(), mat[0] );
//	nt[1] = DotProduct( vert.m_vecNormal.Base(), mat[1] );
//	nt[2] = DotProduct( vert.m_vecNormal.Base(), mat[2] );

#ifdef _WIN32
	__asm
	{
		mov		eax, DWORD PTR [pMat]
		mov		edx, DWORD PTR [pPos]
		mov		ecx, DWORD PTR [pNormal]
		mov		esi, DWORD PTR [pNormalOut]
		mov		edi, DWORD PTR [pPosOut]

		fld DWORD PTR[eax + 3*4]	; m03
		fld DWORD PTR[eax + 0]		; m00		| m03
		fld DWORD PTR[edx + 0]		; pos.x		| m00		| m03
		fld DWORD PTR[ecx + 0]		; norm.x	| pos.x		| m00		| m03
		fld DWORD PTR[edx + 4]		; pos.y		| norm.x	| pos.x		| m00		| m03
		fld DWORD PTR[eax + 1*4]	; m01		| pos.y		| norm.x	| pos.x		| m00		| m03
		fld DWORD PTR[ecx + 4]		; norm.y	| m01		| pos.y		| norm.x	| pos.x		| m00		| m03

		fxch st(5)					; m00		| m01				| pos.y			| norm.x		| pos.x			| norm.y	| m03
		fmul st(4), st(0)			; m00		| m01				| pos.y			| norm.x 		| pos.x	* m00	| norm.y	| m03
		fmulp st(3), st(0)			; m01		| pos.y				| norm.x * m00	| pos.x	* m00	| norm.y		| m03
		fmul st(1), st(0)			; m01		| pos.y * m01		| norm.x * m00	| pos.x	* m00	| norm.y		| m03
		fmulp st(4), st(0)			; pos.y * m01	| norm.x * m00	| pos.x * m00	| norm.y * m01	| m03

		fld DWORD PTR[eax + 2*4]	; m02		| pos.y * m01		| norm.x * m00	| pos.x * m00		| norm.y * m01	| m03
		fld DWORD PTR[edx + 8]		; pos.z		| m02				| pos.y * m01		| norm.x * m00	| pos.x * m00		| norm.y * m01	| m03
		fld DWORD PTR[ecx + 8]		; norm.z	| pos.z					| m02				| pos.y * m01		| norm.x * m00	| pos.x * m00		| norm.y * m01	| m03

		fxch st(5)					; pos.x * m00	| pos.z							| m02						| pos.y * m01	| norm.x * m00	| norm.z				| norm.y * m01	| m03
		faddp st(3), st(0)			; pos.z			| m02							| pos.x * m00 + pos.y * m01	| norm.x * m00	| norm.z				| norm.y * m01	| m03
		fxch st(3)					; norm.x * m00	| m02							| pos.x * m00 + pos.y * m01	| pos.z			| norm.z				| norm.y * m01	| m03
		faddp st(5), st(0)			; m02			| pos.x * m00 + pos.y * m01		| pos.z						| norm.z		| norm.y * m01 + norm.x * m00		| m03
		fmul  st(2), st(0)			; m02			| pos.x * m00 + pos.y * m01		| pos.z * m02				| norm.z		| norm.y * m01 + norm.x * m00		| m03
		fmulp st(3), st(0)			; pos.x * m00 + pos.y * m01		| pos.z * m02	| norm.z * m02	| norm.y * m01 + norm.x * m00		| m03
		faddp st(4), st(0)			; pos.z * m02	| norm.z * m02	| norm.y * m01 + norm.x * m00		| pos.x * m00 + pos.y * m01	+ m03

		; NOTE: Need two more instructions before we can start using the .z stuff

		fld DWORD PTR[eax + 1*16 + 3*4]	; m13		| pos.z * m02	| norm.z * m02	| norm.y * m01 + norm.x * m00		| pos.x * m00 + pos.y * m01	+ m03
		fld DWORD PTR[eax + 1*16 + 0]	; m10		| m13		| pos.z * m02		| norm.z * m02	| norm.y * m01 + norm.x * m00	| pos.x * m00 + pos.y * m01	+ m03
		fld DWORD PTR[edx + 0]		; pos.x		| m10		| m13				| pos.z * m02	| norm.z * m02		| norm.y * m01 + norm.x * m00		| pos.x * m00 + pos.y * m01	+ m03
		fld DWORD PTR[ecx + 0]		; norm.x	| pos.x		| m10				| m13			| pos.z * m02		| norm.z * m02	| norm.y * m01 + norm.x * m00		| pos.x * m00 + pos.y * m01	+ m03

		fxch st(7)					; pos.x * m00 + pos.y * m01	+ m03 | pos.x		| m10 | m13 | pos.z * m02		| norm.z * m02	| norm.y * m01 + norm.x * m00		| norm.x
		faddp st(4), st(0)			; pos.x		| m10 | m13 | pos.x * m00 + pos.y * m01	+ pos.z * m02 + m03 	| norm.z * m02	| norm.y * m01 + norm.x * m00		| norm.x
		fxch st(5)					; norm.y * m01 + norm.x * m00		| m10 | m13 | pos.x * m00 + pos.y * m01	+ pos.z * m02 + m03 	| norm.z * m02	| pos.x		| norm.x
		faddp st(4), st(0)			; m10 | m13 | pos.x * m00 + pos.y * m01	+ pos.z * m02 + m03 	| norm.x * m00 + norm.y * m01 + norm.z * m02	| pos.x		| norm.x
		fmul st(4), st(0)			; m10 | m13 | pos.x * m00 + pos.y * m01	+ pos.z * m02 + m03 	| norm.x * m00 + norm.y * m01 + norm.z * m02	| pos.x	* m10	| norm.x
		fmulp st(5), st(0)			; m13 | pos.x * m00 + pos.y * m01	+ pos.z * m02 + m03 	| norm.x * m00 + norm.y * m01 + norm.z * m02	| pos.x	* m10	| norm.x * m10
		fxch st(2)					; norm.x * m00 + norm.y * m01 + norm.z * m02 | pos.x * m00 + pos.y * m01	+ pos.z * m02 + m03 	| m13	| pos.x	* m10	| norm.x * m10

		fstp DWORD PTR[esi]			; pos.x * m00 + pos.y * m01	+ pos.z * m02 + m03 	| m13	| pos.x	* m10	| norm.x * m10
		fstp DWORD PTR[edi]			; m13	| pos.x	* m10	| norm.x * m10

		fld DWORD PTR[eax + 1*16 + 1*4]				; m11	| m13	| pos.x	* m10	| norm.x * m10
		fld DWORD PTR[edx + 4]		; pos.y	| m11	| m13		| pos.x	* m10	| norm.x * m10
		fld DWORD PTR[ecx + 4]		; norm.y| pos.y	| m11		| m13	| pos.x	* m10	| norm.x * m10
		fld DWORD PTR[eax + 1*16 + 2*4]				; m12	| norm.y| pos.y		| m11	| m13	| pos.x	* m10	| norm.x * m10
		fld DWORD PTR[edx + 8]		; pos.z	| m12	| norm.y	| pos.y	| m11	| m13	| pos.x	* m10	| norm.x * m10

		fxch st(4)					; m11	| m12	| norm.y	| pos.y		| pos.z		| m13	| pos.x	* m10	| norm.x * m10
		fmul st(3), st(0)			; m11	| m12	| norm.y	| pos.y	* m11	| pos.z		| m13	| pos.x	* m10	| norm.x * m10
		fmulp st(2), st(0)			; m12	| norm.y * m11	| pos.y	* m11	| pos.z		| m13	| pos.x	* m10	| norm.x * m10

		fld DWORD PTR[ecx + 8]		; norm.z	| m12	| norm.y * m11	| pos.y	* m11	| pos.z		| m13	| pos.x	* m10	| norm.x * m10
		fxch st(1)					; m12	| norm.z	| norm.y * m11	| pos.y	* m11	| pos.z		| m13	| pos.x	* m10	| norm.x * m10
		fmul st(4), st(0)			; m12	| norm.z	| norm.y * m11	| pos.y	* m11	| pos.z * m12		| m13	| pos.x	* m10	| norm.x * m10
		fmulp st(1), st(0)			; norm.z * m12		| norm.y * m11	| pos.y	* m11	| pos.z * m12		| m13	| pos.x	* m10	| norm.x * m10

		fld DWORD PTR[eax + 2*16 + 3*4]				; m23	|	norm.z * m12	| norm.y * m11	| pos.y	* m11	| pos.z * m12		| m13	| pos.x	* m10	| norm.x * m10

		fxch st(6)					; pos.x	* m10	| norm.z * m12	| norm.y * m11	| pos.y	* m11	| pos.z * m12	| m13			| m23			| norm.x * m10
		faddp st(3), st(0)			; norm.z * m12	| norm.y * m11	| pos.x	* m10 + pos.y * m11	| pos.z * m12		| m13			| m23			| norm.x * m10
		fxch st(4)					; m13			| norm.y * m11	| pos.x	* m10 + pos.y * m11	| pos.z * m12		| norm.z * m12	| m23			| norm.x * m10
		faddp st(3), st(0)			; norm.y * m11	| pos.x	* m10 + pos.y * m11	| pos.z * m12 + m13	| norm.z * m12	| m23			| norm.x * m10
		faddp st(5), st(0)			; pos.x	* m10 + pos.y * m11	| pos.z * m12 + m13	| norm.z * m12	| m23			| norm.x * m10 + norm.y * m11

		fld DWORD PTR[eax + 2*16 + 0]				; m20		| pos.x	* m10 + pos.y * m11	| pos.z * m12 + m13	| norm.z * m12	| m23			| norm.x * m10 + norm.y * m11
		fld DWORD PTR[edx + 0]		; pos.x		| m20		| pos.x	* m10 + pos.y * m11	| pos.z * m12 + m13	| norm.z * m12	| m23			| norm.x * m10 + norm.y * m11
		fld DWORD PTR[ecx + 0]		; norm.x	| pos.x		| m20		| pos.x	* m10 + pos.y * m11	| pos.z * m12 + m13	| norm.z * m12	| m23			| norm.x * m10 + norm.y * m11

		fxch st(3)					; pos.x	* m10 + pos.y * m11	| pos.x		| m20		| norm.x	| pos.z * m12 + m13	| norm.z * m12	| m23			| norm.x * m10 + norm.y * m11
		faddp st(4), st(0)			; pos.x		| m20		| norm.x	| pos.x	* m10 + pos.y * m11	+ pos.z * m12 + m13	| norm.z * m12	| m23			| norm.x * m10 + norm.y * m11
		fxch st(4)					; norm.z * m12	| m20		| norm.x	| pos.x	* m10 + pos.y * m11	+ pos.z * m12 + m13	| pos.x		| m23			| norm.x * m10 + norm.y * m11
		faddp st(6), st(0)			; m20		| norm.x	| pos.x	* m10 + pos.y * m11	+ pos.z * m12 + m13	| pos.x		| m23			| norm.x * m10 + norm.y * m11 + norm.z * m12
		fmul st(3), st(0)			; m20		| norm.x	| pos.x	* m10 + pos.y * m11	+ pos.z * m12 + m13	| pos.x	* m20	| m23			| norm.x * m10 + norm.y * m11 + norm.z * m12
		fmulp st(1), st(0)			; norm.x * m20	| pos.x	* m10 + pos.y * m11	+ pos.z * m12 + m13	| pos.x	* m20	| m23			| norm.x * m10 + norm.y * m11 + norm.z * m12

		fld DWORD PTR[eax + 2*16 + 1*4]		; m21		| norm.x * m20	| pos.x	* m10 + pos.y * m11	+ pos.z * m12 + m13	| pos.x	* m20	| m23			| norm.x * m10 + norm.y * m11 + norm.z * m12
		fld DWORD PTR[edx + 4]		; pos.y		| m21		| norm.x * m20	| pos.x	* m10 + pos.y * m11	+ pos.z * m12 + m13	| pos.x	* m20	| m23			| norm.x * m10 + norm.y * m11 + norm.z * m12
		fld DWORD PTR[ecx + 4]		; norm.y	| pos.y		| m21		| norm.x * m20	| pos.x	* m10 + pos.y * m11	+ pos.z * m12 + m13	| pos.x	* m20	| m23			| norm.x * m10 + norm.y * m11 + norm.z * m12

		fxch st(4)					; pos.x	* m10 + pos.y * m11	+ pos.z * m12 + m13	| pos.y		| m21		| norm.x * m20	| norm.y	| pos.x	* m20	| m23			| norm.x * m10 + norm.y * m11 + norm.z * m12
		fstp DWORD PTR[edi + 4]		; pos.y		| m21		| norm.x * m20	| norm.y	| pos.x	* m20	| m23			| norm.x * m10 + norm.y * m11 + norm.z * m12
		fxch st(6)					; norm.x * m10 + norm.y * m11 + norm.z * m12		| m21		| norm.x * m20	| norm.y	| pos.x	* m20	| m23			| pos.y
		fstp DWORD PTR[esi + 4]		; m21		| norm.x * m20	| norm.y	| pos.x	* m20	| m23			| pos.y

		fmul st(5), st(0)			; m21			| norm.x * m20	| norm.y		| pos.x	* m20	| m23			| pos.y	* m21
		fmulp st(2), st(0)			; norm.x * m20	| norm.y * m21	| pos.x	* m20	| m23			| pos.y	* m21

		fld DWORD PTR[eax + 2*16 + 2*4]	; m22			| norm.x * m20	| norm.y * m21	| pos.x	* m20	| m23			| pos.y	* m21
		fld DWORD PTR[edx + 8]		; pos.z			| m22			| norm.x * m20	| norm.y * m21	| pos.x	* m20	| m23			| pos.y	* m21
		fld DWORD PTR[ecx + 8]		; norm.z		| pos.z			| m22			| norm.x * m20	| norm.y * m21	| pos.x	* m20	| m23			| pos.y	* m21

		fxch st(5) 					; pos.x	* m20	| pos.z			| m22			| norm.x * m20	| norm.y * m21	| norm.z		| m23			| pos.y	* m21
		faddp st(6), st(0)			; pos.z			| m22			| norm.x * m20	| norm.y * m21	| norm.z		| pos.x	* m20 + m23	| pos.y	* m21
		fxch st(2)					; norm.x * m20 	| m22			| pos.z			| norm.y * m21	| norm.z		| pos.x	* m20 + m23	| pos.y	* m21
		faddp st(3), st(0)			; m22			| pos.z			| norm.x * m20 + norm.y * m21	| norm.z		| pos.x	* m20 + m23	| pos.y	* m21
		fmul st(1), st(0)			; m22			| pos.z * m22	| norm.x * m20 + norm.y * m21	| norm.z		| pos.x	* m20 + m23	| pos.y	* m21
		fmulp st(3), st(0)			; pos.z * m22	| norm.x * m20 + norm.y * m21	| norm.z * m22	| pos.x	* m20 + m23	| pos.y	* m21
		fxch st(4)					; pos.y * m21	| norm.x * m20 + norm.y * m21	| norm.z * m22	| pos.x	* m20 + m23	| pos.z	* m22
		faddp st(3), st(0)			; norm.x * m20 + norm.y * m21	| norm.z * m22	| pos.x	* m20 + pos.y * m21 + m23	| pos.z	* m22

		; STALLS AHOY!!!

		faddp st(1), st(0)			; norm.x * m20 + norm.y * m21 + norm.z * m22	| pos.x	* m20 + pos.y * m21 + m23	| pos.z	* m22
		fxch st(2)					; pos.z	* m22									| pos.x	* m20 + pos.y * m21 + m23	| norm.x * m20 + norm.y * m21 + norm.z * m22 
		faddp st(1), st(0)			; pos.x	* m20 + pos.y * m21 + pos.z	* m22 + m23	| norm.x * m20 + norm.y * m21 + norm.z * m22 
		fxch st(1)					; norm.x * m20 + norm.y * m21 + norm.z * m22	| pos.x	* m20 + pos.y * m21 + pos.z	* m22 + m23
		fstp DWORD PTR[esi + 8]		; pos.x	* m20 + pos.y * m21 + pos.z	* m22 + m23
		fstp DWORD PTR[edi + 8]
	}
#elif _LINUX
#warning "TransformAndRotate C implementation only"
	VectorTransform( pPos, mat, pPosOut);
	VectorRotate( pPosOut, mat, pPosOut);
	norm=pos;
	VectorNormalize(norm);
#else
#error

#endif

}

// rotate by the inverse of the matrix
void VectorIRotate( const float *in1, const matrix3x4_t& in2, float *out )
{
	Assert( s_bMathlibInitialized );
	Assert( in1 != out );
	out[0] = in1[0]*in2[0][0] + in1[1]*in2[1][0] + in1[2]*in2[2][0];
	out[1] = in1[0]*in2[0][1] + in1[1]*in2[1][1] + in1[2]*in2[2][1];
	out[2] = in1[0]*in2[0][2] + in1[1]*in2[1][2] + in1[2]*in2[2][2];
}

#ifndef VECTOR_NO_SLOW_OPERATIONS

QAngle TransformAnglesToLocalSpace( const QAngle &angles, const matrix3x4_t &parentMatrix )
{
	matrix3x4_t angToWorld, worldToParent, localMatrix;
	MatrixInvert( parentMatrix, worldToParent );
	AngleMatrix( angles, angToWorld );
	ConcatTransforms( worldToParent, angToWorld, localMatrix );
	
	QAngle out;
	MatrixAngles( localMatrix, out );
	return out;
}

QAngle TransformAnglesToWorldSpace( const QAngle &angles, const matrix3x4_t &parentMatrix )
{
	matrix3x4_t angToParent, angToWorld;
	AngleMatrix( angles, angToParent );
	ConcatTransforms( parentMatrix, angToParent, angToWorld );
	QAngle out;
	MatrixAngles( angToWorld, out );
	return out;
}

#endif

void MatrixInitialize( matrix3x4_t &mat, const Vector &vecOrigin, const Vector &vecXAxis, const Vector &vecYAxis, const Vector &vecZAxis )
{
	MatrixSetColumn( vecXAxis, 0, mat );
	MatrixSetColumn( vecYAxis, 1, mat );
	MatrixSetColumn( vecZAxis, 2, mat );
	MatrixSetColumn( vecOrigin, 3, mat );
}

void MatrixCopy( const matrix3x4_t& in, matrix3x4_t& out )
{
	Assert( s_bMathlibInitialized );
	memcpy( out.Base(), in.Base(), sizeof( float ) * 3 * 4 );
}

void MatrixInvert( const matrix3x4_t& in, matrix3x4_t& out )
{
	Assert( s_bMathlibInitialized );
	if ( &in == &out )
	{
		swap(out[0][1],out[1][0]);
		swap(out[0][2],out[2][0]);
		swap(out[1][2],out[2][1]);
	}
	else
	{
		// I'm guessing this only works on a 3x4 orthonormal matrix
		out[0][0] = in[0][0];
		out[0][1] = in[1][0];
		out[0][2] = in[2][0];

		out[1][0] = in[0][1];
		out[1][1] = in[1][1];
		out[1][2] = in[2][1];

		out[2][0] = in[0][2];
		out[2][1] = in[1][2];
		out[2][2] = in[2][2];
	}

	float tmp[3];
	tmp[0] = in[0][3];
	tmp[1] = in[1][3];
	tmp[2] = in[2][3];

	out[0][3] = -DotProduct( tmp, out[0] );
	out[1][3] = -DotProduct( tmp, out[1] );
	out[2][3] = -DotProduct( tmp, out[2] );
}

void MatrixGetColumn( const matrix3x4_t& in, int column, Vector &out )
{
	out.x = in[0][column];
	out.y = in[1][column];
	out.z = in[2][column];
}

void MatrixSetColumn( const Vector &in, int column, matrix3x4_t& out )
{
	out[0][column] = in.x;
	out[1][column] = in.y;
	out[2][column] = in.z;
}


int VectorCompare (const float *v1, const float *v2)
{
	Assert( s_bMathlibInitialized );
	int		i;
	
	for (i=0 ; i<3 ; i++)
		if (v1[i] != v2[i])
			return 0;
			
	return 1;
}

#if _WIN32
_declspec(naked)
#ifndef PFN_VECTORMA
void VectorMA( const float *start, float scale, const float *direction, float *dest )
#else
void _VectorMA( const float *start, float scale, const float *direction, float *dest )
#endif
{
	Assert( s_bMathlibInitialized );
	_asm {
		mov	eax, DWORD PTR [esp+0x04]	; *start, s0..s2
		mov ecx, DWORD PTR [esp+0x0c]	; *direction, d0..d2
		mov edx, DWORD PTR [esp+0x10]	; *dest
		fld DWORD PTR [esp+0x08]		; sc
		fld	DWORD PTR [eax]				; s0		| sc
		fld DWORD PTR [ecx]				; d0		| s0	| sc
		fld DWORD PTR [eax+4]			; s1		| d0	| s0	| sc
		fld DWORD PTR [ecx+4]			; d1		| s1	| d0	| s0	| sc
		fld DWORD PTR [eax+8]			; s2		| d1	| s1	| d0	| s0	| sc
		fxch st(4)						; s0		| d1	| s1	| d0	| s2	| sc
		fld DWORD PTR [ecx+8]			; d2		| s0	| d1	| s1	| d0	| s2	| sc
		fxch st(4)						; d0		| s0	| d1	| s1	| d2	| s2	| sc
		fmul st,st(6)					; d0*sc		| s0	| d1	| s1	| d2	| s2	| sc
		fxch st(2)						; d1		| s0	| d0*sc	| s1	| d2	| s2	| sc
		fmul st,st(6)					; d1*sc		| s0	| d0*sc	| s1	| d2	| s2	| sc
		fxch st(4)						; d2 		| s0	| d0*sc	| s1	| d1*sc | s2	| sc
		fmulp st(6),st					; s0		| d0*sc	| s1	| d1*sc	| s2	| d2*sc
		faddp st(1),st					; s0+d0*sc	| s1	| d1*sc	| s2	| d2*sc
		fstp DWORD PTR [edx]			; s1		| d1*sc	| s2	| d2*sc
		faddp st(1),st					; s1+d1*sc	| s2	| d2*sc
		fstp DWORD PTR [edx+4]			; s2		| d2*sc
		faddp st(1),st					; s2+d2*sc
		fstp DWORD PTR [edx+8]			; [Empty stack]
		ret
	}
}
#else
#ifndef PFN_VECTORMA
void VectorMA( const float *start, float scale, const float *direction, float *dest )
#else
void _VectorMA( const float *start, float scale, const float *direction, float *dest )
#endif
{
	Assert( s_bMathlibInitialized );
	dest[0] = start[0] + scale*direction[0];
	dest[1] = start[1] + scale*direction[1];
	dest[2] = start[2] + scale*direction[2];
}
#endif

#ifdef _WIN32
void 
_declspec(naked)
_SSE_VectorMA( const float *start, float scale, const float *direction, float *dest )
{
	// FIXME: This don't work!! It will overwrite memory in the write to dest
	Assert(0);

	Assert( s_bMathlibInitialized );
	_asm {  // Intel SSE only routine
		mov	eax, DWORD PTR [esp+0x04]	; *start, s0..s2
		mov ecx, DWORD PTR [esp+0x0c]	; *direction, d0..d2
		mov edx, DWORD PTR [esp+0x10]	; *dest
		movss	xmm2, [esp+0x08]		; x2 = scale, 0, 0, 0
#ifdef ALIGNED_VECTOR
		movaps	xmm3, [ecx]				; x3 = dir0,dir1,dir2,X
		pshufd	xmm2, xmm2, 0			; x2 = scale, scale, scale, scale
		movaps	xmm1, [eax]				; x1 = start1, start2, start3, X
		mulps	xmm3, xmm2				; x3 *= x2
		addps	xmm3, xmm1				; x3 += x1
		movaps	[edx], xmm3				; *dest = x3
#else
		movups	xmm3, [ecx]				; x3 = dir0,dir1,dir2,X
		pshufd	xmm2, xmm2, 0			; x2 = scale, scale, scale, scale
		movups	xmm1, [eax]				; x1 = start1, start2, start3, X
		mulps	xmm3, xmm2				; x3 *= x2
		addps	xmm3, xmm1				; x3 += x1
		movups	[edx], xmm3				; *dest = x3
#endif
	}
}
#endif

#ifdef OLD_DOTPRODUCT

_declspec(naked)
vec_t __cdecl DotProduct (const vec_t *a, const vec_t *c)
{
	Assert( s_bMathlibInitialized );
	_asm {
		mov	eax, DWORD PTR [esp+4]	; *a
		mov ecx, DWORD PTR [esp+8]	; *c
		fld DWORD PTR [eax]				; a0
		fmul DWORD PTR [ecx]			; a0*c0
		fld DWORD PTR [eax+4]			; a1	| a0*c0
		fmul DWORD PTR [ecx+4]			; a1*c1	| a0*c0
		fld DWORD PTR [eax+8]			; a2	| a1*c1 | a0*c0
		fmul DWORD PTR [ecx+8]			; a2*c2	| a1*c1 | a0*c0
		fxch st(2)						; a0*c0 | a1*c1 | a2*c2
		faddp st(1), st					; a1*c1+a0*c0	| a2*c2
		faddp st(1), st					; a2*c2+a0*c0+a1*c1
		ret
	}
}

#endif


// SSE DotProduct -- it's a smidgen faster than the asm DotProduct...
//   Should be validated too!  :)
//   NJS: (Nov 1 2002) -NOT- faster.  may time a couple cycles faster in a single function like 
//   this, but when inlined, and instruction scheduled, the C version is faster.  
//   Verified this via VTune
   
/*
vec_t DotProduct (const vec_t *a, const vec_t *c)
{
	vec_t temp;

	__asm
	{
		mov eax, a;
		mov ecx, c;
		mov edx, DWORD PTR [temp]
		movss xmm0, [eax];
		mulss xmm0, [ecx];
		movss xmm1, [eax+4];
		mulss xmm1, [ecx+4];
		movss xmm2, [eax+8];
		mulss xmm2, [ecx+8];
		addss xmm0, xmm1;
		addss xmm0, xmm2;
		movss [edx], xmm0;
		fld DWORD PTR [edx];
		ret
	}
}
*/

void CrossProduct (const float* v1, const float* v2, float* cross)
{
	Assert( s_bMathlibInitialized );
	Assert( v1 != cross );
	Assert( v2 != cross );
	cross[0] = v1[1]*v2[2] - v1[2]*v2[1];
	cross[1] = v1[2]*v2[0] - v1[0]*v2[2];
	cross[2] = v1[0]*v2[1] - v1[1]*v2[0];
}







int Q_log2(int val)
{
	int answer=0;
	while (val>>=1)
		answer++;
	return answer;
}

void VectorMatrix( const Vector &forward, matrix3x4_t& matrix)
{
	Assert( s_bMathlibInitialized );
	Vector right, left, up, tmp;

	if (forward[0] == 0 && forward[1] == 0)
	{
		// pitch 90 degrees up/down from identity
		left[0] = 0;	
		left[1] = 1; 
		left[2] = 0;
		up[0] = -forward[2]; // NOTE: This should be 1.0f or -1.0f, Assert?
		up[1] = 0; 
		up[2] = 0;
	}
	else
	{
		tmp[0] = 0; tmp[1] = 0; tmp[2] = 1.0;
		CrossProduct( tmp, forward, left );
		VectorNormalize( left );
		CrossProduct( forward, left, up );
		VectorNormalize( up );
	}
	MatrixSetColumn( forward, 0, matrix );
	MatrixSetColumn( left, 1, matrix );
	MatrixSetColumn( up, 2, matrix );
}


void VectorVectors( const Vector &forward, Vector &right, Vector &up )
{
	Assert( s_bMathlibInitialized );
	Vector tmp;

	if (forward[0] == 0 && forward[1] == 0)
	{
		// pitch 90 degrees up/down from identity
		right[0] = 0;	
		right[1] = -1; 
		right[2] = 0;
		up[0] = -forward[2]; 
		up[1] = 0; 
		up[2] = 0;
	}
	else
	{
		tmp[0] = 0; tmp[1] = 0; tmp[2] = 1.0;
		CrossProduct( forward, tmp, right );
		VectorNormalize( right );
		CrossProduct( right, forward, up );
		VectorNormalize( up );
	}
}

void VectorAngles( const float *forward, float *angles )
{
	Assert( s_bMathlibInitialized );
	float	tmp, yaw, pitch;
	
	if (forward[1] == 0 && forward[0] == 0)
	{
		yaw = 0;
		if (forward[2] > 0)
			pitch = 270;
		else
			pitch = 90;
	}
	else
	{
		yaw = (atan2(forward[1], forward[0]) * 180 / M_PI);
		if (yaw < 0)
			yaw += 360;

		tmp = sqrt (forward[0]*forward[0] + forward[1]*forward[1]);
		pitch = (atan2(-forward[2], tmp) * 180 / M_PI);
		if (pitch < 0)
			pitch += 360;
	}
	
	angles[0] = pitch;
	angles[1] = yaw;
	angles[2] = 0;
}


/*
================
R_ConcatRotations
================
*/
void ConcatRotations (const float in1[3][3], const float in2[3][3], float out[3][3])
{
	Assert( s_bMathlibInitialized );
	Assert( in1 != out );
	Assert( in2 != out );
	out[0][0] = in1[0][0] * in2[0][0] + in1[0][1] * in2[1][0] +
				in1[0][2] * in2[2][0];
	out[0][1] = in1[0][0] * in2[0][1] + in1[0][1] * in2[1][1] +
				in1[0][2] * in2[2][1];
	out[0][2] = in1[0][0] * in2[0][2] + in1[0][1] * in2[1][2] +
				in1[0][2] * in2[2][2];
	out[1][0] = in1[1][0] * in2[0][0] + in1[1][1] * in2[1][0] +
				in1[1][2] * in2[2][0];
	out[1][1] = in1[1][0] * in2[0][1] + in1[1][1] * in2[1][1] +
				in1[1][2] * in2[2][1];
	out[1][2] = in1[1][0] * in2[0][2] + in1[1][1] * in2[1][2] +
				in1[1][2] * in2[2][2];
	out[2][0] = in1[2][0] * in2[0][0] + in1[2][1] * in2[1][0] +
				in1[2][2] * in2[2][0];
	out[2][1] = in1[2][0] * in2[0][1] + in1[2][1] * in2[1][1] +
				in1[2][2] * in2[2][1];
	out[2][2] = in1[2][0] * in2[0][2] + in1[2][1] * in2[1][2] +
				in1[2][2] * in2[2][2];
}


/*
================
R_ConcatTransforms
================
*/
void ConcatTransforms (const matrix3x4_t& in1, const matrix3x4_t& in2, matrix3x4_t& out)
{
	Assert( s_bMathlibInitialized );
	if ( &in1 == &out )
	{
		matrix3x4_t in1b;
		MatrixCopy( in1, in1b );
		ConcatTransforms( in1b, in2, out );
		return;
	}
	if ( &in2 == &out )
	{
		matrix3x4_t in2b;
		MatrixCopy( in2, in2b );
		ConcatTransforms( in1, in2b, out );
		return;
	}
	out[0][0] = in1[0][0] * in2[0][0] + in1[0][1] * in2[1][0] +
				in1[0][2] * in2[2][0];
	out[0][1] = in1[0][0] * in2[0][1] + in1[0][1] * in2[1][1] +
				in1[0][2] * in2[2][1];
	out[0][2] = in1[0][0] * in2[0][2] + in1[0][1] * in2[1][2] +
				in1[0][2] * in2[2][2];
	out[0][3] = in1[0][0] * in2[0][3] + in1[0][1] * in2[1][3] +
				in1[0][2] * in2[2][3] + in1[0][3];
	out[1][0] = in1[1][0] * in2[0][0] + in1[1][1] * in2[1][0] +
				in1[1][2] * in2[2][0];
	out[1][1] = in1[1][0] * in2[0][1] + in1[1][1] * in2[1][1] +
				in1[1][2] * in2[2][1];
	out[1][2] = in1[1][0] * in2[0][2] + in1[1][1] * in2[1][2] +
				in1[1][2] * in2[2][2];
	out[1][3] = in1[1][0] * in2[0][3] + in1[1][1] * in2[1][3] +
				in1[1][2] * in2[2][3] + in1[1][3];
	out[2][0] = in1[2][0] * in2[0][0] + in1[2][1] * in2[1][0] +
				in1[2][2] * in2[2][0];
	out[2][1] = in1[2][0] * in2[0][1] + in1[2][1] * in2[1][1] +
				in1[2][2] * in2[2][1];
	out[2][2] = in1[2][0] * in2[0][2] + in1[2][1] * in2[1][2] +
				in1[2][2] * in2[2][2];
	out[2][3] = in1[2][0] * in2[0][3] + in1[2][1] * in2[1][3] +
				in1[2][2] * in2[2][3] + in1[2][3];
}


/*
===================
FloorDivMod

Returns mathematically correct (floor-based) quotient and remainder for
numer and denom, both of which should contain no fractional part. The
quotient must fit in 32 bits.
====================
*/

void FloorDivMod (double numer, double denom, int *quotient,
		int *rem)
{
	Assert( s_bMathlibInitialized );
	int		q, r;
	double	x;

#ifdef PARANOID
	if (denom <= 0.0)
		Sys_Error ("FloorDivMod: bad denominator %d\n", denom);

//	if ((floor(numer) != numer) || (floor(denom) != denom))
//		Sys_Error ("FloorDivMod: non-integer numer or denom %f %f\n",
//				numer, denom);
#endif

	if (numer >= 0.0)
	{

		x = floor(numer / denom);
		q = (int)x;
		r = Floor2Int(numer - (x * denom));
	}
	else
	{
	//
	// perform operations with positive values, and fix mod to make floor-based
	//
		x = floor(-numer / denom);
		q = -(int)x;
		r = Floor2Int(-numer - (x * denom));
		if (r != 0)
		{
			q--;
			r = (int)denom - r;
		}
	}

	*quotient = q;
	*rem = r;
}


/*
===================
GreatestCommonDivisor
====================
*/
int GreatestCommonDivisor (int i1, int i2)
{
	Assert( s_bMathlibInitialized );
	if (i1 > i2)
	{
		if (i2 == 0)
			return (i1);
		return GreatestCommonDivisor (i2, i1 % i2);
	}
	else
	{
		if (i1 == 0)
			return (i2);
		return GreatestCommonDivisor (i1, i2 % i1);
	}
}


bool IsDenormal( const float &val )
{
	const int x = *reinterpret_cast <const int *> (&val); // needs 32-bit int
	const int abs_mantissa = x & 0x007FFFFF;
	const int biased_exponent = x & 0x7F800000;
	
	return  ( biased_exponent == 0 && abs_mantissa != 0 );
}


// ************************************** SPECIAL NOTE *******************************************************
// IFF you find this code to be PROBLEMATIC, PLEASE "#if 0" this and send email to "tkback@valvesoftware.com
// Note what machine this happened on and BUILD a debug version of the binary and see if it catches the problem.
// ***********************************************************************************************************

#if 0

#ifdef __cplusplus
extern "C" {
#endif

// This code is an TEST OPTIMIZATION for float -> long.  DISABLE this (use MSVCRT ver) if it causes problems.
// 
 
// WARNING!!!  This is an experimental replacement for the compiler intrinsic "_ftol()'
// This code is called whenever there is a float to int conversion in c code.
// Try this 80x87 specific FPU conversion trick!  (This normally does a ROUND during FISTP by default)
// If positive, we subtract 0.5 before doing a round() which equals floor() for positive values
// If negative, we add 0.5 before doing a round() which equals ceil() for negative values.
// Together, this simulates "chop to zero" even when the FPU is in normal rounding mode.

// The following two values should be "cache line aligned" ie. align 16.
const unsigned long FPU_HALF    = 0x3EFFFFFFL;
const unsigned long	FPU_MAGIC	= 0x59C00000L;

long int
__declspec( naked ) 
__cdecl _ftol()
{
#ifdef _DEBUG

// Turn off "Debug Multitreaded C Runtime Lib use in release mode" to avoid this.(why debug?)
//#ifdef NDEBUG
//#error Both _DEBUG and NDEBUG set!
//#endif

	Assert( s_bMathlibInitialized );
	// Make sure that we are in 64-bit precision(53-bit mantissa) FPU mode. _ftol() needs 64bit.
	_asm {
		fnstcw	word ptr [esp-4]
		and		word ptr [esp-4], 0x0300
		cmp		word ptr [esp-4], 0x0200	; Precision bits.  Must be 0x0200
		je		short good_precision
		int		3				; WRONG Precision set on FPU!!!  s/b >32bit Internally.
good_precision:
	}
#endif

	// Start the float to long conversion
	_asm {								; farg
		ftst
		fnstsw ax
		sahf
		jbe short negative

		fsub DWORD PTR [FPU_HALF]		; Subtract 0.5 from positive float, this changes round() to floor()!
		fadd DWORD PTR [FPU_MAGIC]		; Adjust the mantissa to have the "int" in the low bits.
		fstp QWORD PTR [esp-0Ch]		; Write out the floating point value in 64 bit double format
		mov	eax, DWORD PTR [esp-0Ch]	; Grab the low 32 bits of the mantissa which have our "int".
		ret

negative:
		fadd DWORD PTR [FPU_HALF]		; Add 0.5 to Negative float, this changes round() to floor()!
		fadd DWORD PTR [FPU_MAGIC]		; Adjust the mantissa to have the "int" in the low bits.
		fstp QWORD PTR [esp-0Ch]		; Write out the floating point value in 64 bit double format
		mov	eax, DWORD PTR [esp-0Ch]	; Grab the low 32 bits of the mantissa which have our "int".
		ret

	}
}

#ifdef __cplusplus
}
#endif

#endif

int SignbitsForPlane (cplane_t *out)
{
	Assert( s_bMathlibInitialized );
	int	bits, j;

	// for fast box on planeside test

	bits = 0;
	for (j=0 ; j<3 ; j++)
	{
		if (out->normal[j] < 0)
			bits |= 1<<j;
	}
	return bits;
}


/*
==================
BoxOnPlaneSide

Returns 1, 2, or 1 + 2
==================
*/
#if 1 || !id386 || defined _LINUX
int __cdecl BoxOnPlaneSide (const float *emins, const float *emaxs, const cplane_t *p)
{
	Assert( s_bMathlibInitialized );
	float	dist1, dist2;
	int		sides;

// fast axial cases
	if (p->type < 3)
	{
		if (p->dist <= emins[p->type])
			return 1;
		if (p->dist >= emaxs[p->type])
			return 2;
		return 3;
	}
	
// general case
	switch (p->signbits)
	{
	case 0:
dist1 = p->normal[0]*emaxs[0] + p->normal[1]*emaxs[1] + p->normal[2]*emaxs[2];
dist2 = p->normal[0]*emins[0] + p->normal[1]*emins[1] + p->normal[2]*emins[2];
		break;
	case 1:
dist1 = p->normal[0]*emins[0] + p->normal[1]*emaxs[1] + p->normal[2]*emaxs[2];
dist2 = p->normal[0]*emaxs[0] + p->normal[1]*emins[1] + p->normal[2]*emins[2];
		break;
	case 2:
dist1 = p->normal[0]*emaxs[0] + p->normal[1]*emins[1] + p->normal[2]*emaxs[2];
dist2 = p->normal[0]*emins[0] + p->normal[1]*emaxs[1] + p->normal[2]*emins[2];
		break;
	case 3:
dist1 = p->normal[0]*emins[0] + p->normal[1]*emins[1] + p->normal[2]*emaxs[2];
dist2 = p->normal[0]*emaxs[0] + p->normal[1]*emaxs[1] + p->normal[2]*emins[2];
		break;
	case 4:
dist1 = p->normal[0]*emaxs[0] + p->normal[1]*emaxs[1] + p->normal[2]*emins[2];
dist2 = p->normal[0]*emins[0] + p->normal[1]*emins[1] + p->normal[2]*emaxs[2];
		break;
	case 5:
dist1 = p->normal[0]*emins[0] + p->normal[1]*emaxs[1] + p->normal[2]*emins[2];
dist2 = p->normal[0]*emaxs[0] + p->normal[1]*emins[1] + p->normal[2]*emaxs[2];
		break;
	case 6:
dist1 = p->normal[0]*emaxs[0] + p->normal[1]*emins[1] + p->normal[2]*emins[2];
dist2 = p->normal[0]*emins[0] + p->normal[1]*emaxs[1] + p->normal[2]*emaxs[2];
		break;
	case 7:
dist1 = p->normal[0]*emins[0] + p->normal[1]*emins[1] + p->normal[2]*emins[2];
dist2 = p->normal[0]*emaxs[0] + p->normal[1]*emaxs[1] + p->normal[2]*emaxs[2];
		break;
	default:
		dist1 = dist2 = 0;		// shut up compiler
		Assert( 0 );
		break;
	}

	sides = 0;
	if (dist1 >= p->dist)
		sides = 1;
	if (dist2 < p->dist)
		sides |= 2;

	Assert( sides != 0 );

	return sides;
}
#else
#pragma warning( disable: 4035 )

__declspec( naked ) int __cdecl BoxOnPlaneSide (const float *emins, const float *emaxs, const cplane_t *p)
{
	Assert( s_bMathlibInitialized );
	static int bops_initialized;
	static int Ljmptab[8];

	__asm {

		push ebx
			
		cmp bops_initialized, 1
		je  initialized
		mov bops_initialized, 1
		
		mov Ljmptab[0*4], offset Lcase0
		mov Ljmptab[1*4], offset Lcase1
		mov Ljmptab[2*4], offset Lcase2
		mov Ljmptab[3*4], offset Lcase3
		mov Ljmptab[4*4], offset Lcase4
		mov Ljmptab[5*4], offset Lcase5
		mov Ljmptab[6*4], offset Lcase6
		mov Ljmptab[7*4], offset Lcase7
			
initialized:

		mov edx,ds:dword ptr[4+12+esp]
		mov ecx,ds:dword ptr[4+4+esp]
		xor eax,eax
		mov ebx,ds:dword ptr[4+8+esp]
		mov al,ds:byte ptr[17+edx]
		cmp al,8
		jge Lerror
		fld ds:dword ptr[0+edx]
		fld st(0)
		jmp dword ptr[Ljmptab+eax*4]
Lcase0:
		fmul ds:dword ptr[ebx]
		fld ds:dword ptr[0+4+edx]
		fxch st(2)
		fmul ds:dword ptr[ecx]
		fxch st(2)
		fld st(0)
		fmul ds:dword ptr[4+ebx]
		fld ds:dword ptr[0+8+edx]
		fxch st(2)
		fmul ds:dword ptr[4+ecx]
		fxch st(2)
		fld st(0)
		fmul ds:dword ptr[8+ebx]
		fxch st(5)
		faddp st(3),st(0)
		fmul ds:dword ptr[8+ecx]
		fxch st(1)
		faddp st(3),st(0)
		fxch st(3)
		faddp st(2),st(0)
		jmp LSetSides
Lcase1:
		fmul ds:dword ptr[ecx]
		fld ds:dword ptr[0+4+edx]
		fxch st(2)
		fmul ds:dword ptr[ebx]
		fxch st(2)
		fld st(0)
		fmul ds:dword ptr[4+ebx]
		fld ds:dword ptr[0+8+edx]
		fxch st(2)
		fmul ds:dword ptr[4+ecx]
		fxch st(2)
		fld st(0)
		fmul ds:dword ptr[8+ebx]
		fxch st(5)
		faddp st(3),st(0)
		fmul ds:dword ptr[8+ecx]
		fxch st(1)
		faddp st(3),st(0)
		fxch st(3)
		faddp st(2),st(0)
		jmp LSetSides
Lcase2:
		fmul ds:dword ptr[ebx]
		fld ds:dword ptr[0+4+edx]
		fxch st(2)
		fmul ds:dword ptr[ecx]
		fxch st(2)
		fld st(0)
		fmul ds:dword ptr[4+ecx]
		fld ds:dword ptr[0+8+edx]
		fxch st(2)
		fmul ds:dword ptr[4+ebx]
		fxch st(2)
		fld st(0)
		fmul ds:dword ptr[8+ebx]
		fxch st(5)
		faddp st(3),st(0)
		fmul ds:dword ptr[8+ecx]
		fxch st(1)
		faddp st(3),st(0)
		fxch st(3)
		faddp st(2),st(0)
		jmp LSetSides
Lcase3:
		fmul ds:dword ptr[ecx]
		fld ds:dword ptr[0+4+edx]
		fxch st(2)
		fmul ds:dword ptr[ebx]
		fxch st(2)
		fld st(0)
		fmul ds:dword ptr[4+ecx]
		fld ds:dword ptr[0+8+edx]
		fxch st(2)
		fmul ds:dword ptr[4+ebx]
		fxch st(2)
		fld st(0)
		fmul ds:dword ptr[8+ebx]
		fxch st(5)
		faddp st(3),st(0)
		fmul ds:dword ptr[8+ecx]
		fxch st(1)
		faddp st(3),st(0)
		fxch st(3)
		faddp st(2),st(0)
		jmp LSetSides
Lcase4:
		fmul ds:dword ptr[ebx]
		fld ds:dword ptr[0+4+edx]
		fxch st(2)
		fmul ds:dword ptr[ecx]
		fxch st(2)
		fld st(0)
		fmul ds:dword ptr[4+ebx]
		fld ds:dword ptr[0+8+edx]
		fxch st(2)
		fmul ds:dword ptr[4+ecx]
		fxch st(2)
		fld st(0)
		fmul ds:dword ptr[8+ecx]
		fxch st(5)
		faddp st(3),st(0)
		fmul ds:dword ptr[8+ebx]
		fxch st(1)
		faddp st(3),st(0)
		fxch st(3)
		faddp st(2),st(0)
		jmp LSetSides
Lcase5:
		fmul ds:dword ptr[ecx]
		fld ds:dword ptr[0+4+edx]
		fxch st(2)
		fmul ds:dword ptr[ebx]
		fxch st(2)
		fld st(0)
		fmul ds:dword ptr[4+ebx]
		fld ds:dword ptr[0+8+edx]
		fxch st(2)
		fmul ds:dword ptr[4+ecx]
		fxch st(2)
		fld st(0)
		fmul ds:dword ptr[8+ecx]
		fxch st(5)
		faddp st(3),st(0)
		fmul ds:dword ptr[8+ebx]
		fxch st(1)
		faddp st(3),st(0)
		fxch st(3)
		faddp st(2),st(0)
		jmp LSetSides
Lcase6:
		fmul ds:dword ptr[ebx]
		fld ds:dword ptr[0+4+edx]
		fxch st(2)
		fmul ds:dword ptr[ecx]
		fxch st(2)
		fld st(0)
		fmul ds:dword ptr[4+ecx]
		fld ds:dword ptr[0+8+edx]
		fxch st(2)
		fmul ds:dword ptr[4+ebx]
		fxch st(2)
		fld st(0)
		fmul ds:dword ptr[8+ecx]
		fxch st(5)
		faddp st(3),st(0)
		fmul ds:dword ptr[8+ebx]
		fxch st(1)
		faddp st(3),st(0)
		fxch st(3)
		faddp st(2),st(0)
		jmp LSetSides
Lcase7:
		fmul ds:dword ptr[ecx]
		fld ds:dword ptr[0+4+edx]
		fxch st(2)
		fmul ds:dword ptr[ebx]
		fxch st(2)
		fld st(0)
		fmul ds:dword ptr[4+ecx]
		fld ds:dword ptr[0+8+edx]
		fxch st(2)
		fmul ds:dword ptr[4+ebx]
		fxch st(2)
		fld st(0)
		fmul ds:dword ptr[8+ecx]
		fxch st(5)
		faddp st(3),st(0)
		fmul ds:dword ptr[8+ebx]
		fxch st(1)
		faddp st(3),st(0)
		fxch st(3)
		faddp st(2),st(0)
LSetSides:
		faddp st(2),st(0)
		fcomp ds:dword ptr[12+edx]
		xor ecx,ecx
		fnstsw ax
		fcomp ds:dword ptr[12+edx]
		and ah,1
		xor ah,1
		add cl,ah
		fnstsw ax
		and ah,1
		add ah,ah
		add cl,ah
		pop ebx
		mov eax,ecx
		ret
Lerror:
		int 3
	}
}
#pragma warning( default: 4035 )
#endif







//-----------------------------------------------------------------------------
// VectorMA routines
//-----------------------------------------------------------------------------

#if _WIN32
_declspec(naked)
	#if PFN_VECTORMA
void __cdecl _VectorMA( const Vector &start, float scale, const Vector &direction, Vector &dest )
    #else
void __cdecl VectorMA( const Vector &start, float scale, const Vector &direction, Vector &dest )
	#endif
{
	Assert( s_bMathlibInitialized );
	_asm {
		mov	eax, DWORD PTR [esp+0x04]	; *start, s0..s2
		mov ecx, DWORD PTR [esp+0x0c]	; *direction, d0..d2
		mov edx, DWORD PTR [esp+0x10]	; *dest
		fld DWORD PTR [esp+0x08]		; sc
		fld	DWORD PTR [eax]				; s0		| sc
		fld DWORD PTR [ecx]				; d0		| s0	| sc
		fld DWORD PTR [eax+4]			; s1		| d0	| s0	| sc
		fld DWORD PTR [ecx+4]			; d1		| s1	| d0	| s0	| sc
		fld DWORD PTR [eax+8]			; s2		| d1	| s1	| d0	| s0	| sc
		fxch st(4)						; s0		| d1	| s1	| d0	| s2	| sc
		fld DWORD PTR [ecx+8]			; d2		| s0	| d1	| s1	| d0	| s2	| sc
		fxch st(4)						; d0		| s0	| d1	| s1	| d2	| s2	| sc
		fmul st,st(6)					; d0*sc		| s0	| d1	| s1	| d2	| s2	| sc
		fxch st(2)						; d1		| s0	| d0*sc	| s1	| d2	| s2	| sc
		fmul st,st(6)					; d1*sc		| s0	| d0*sc	| s1	| d2	| s2	| sc
		fxch st(4)						; d2 		| s0	| d0*sc	| s1	| d1*sc | s2	| sc
		fmulp st(6),st					; s0		| d0*sc	| s1	| d1*sc	| s2	| d2*sc
		faddp st(1),st					; s0+d0*sc	| s1	| d1*sc	| s2	| d2*sc
		fstp DWORD PTR [edx]			; s1		| d1*sc	| s2	| d2*sc
		faddp st(1),st					; s1+d1*sc	| s2	| d2*sc
		fstp DWORD PTR [edx+4]			; s2		| d2*sc
		faddp st(1),st					; s2+d2*sc
		fstp DWORD PTR [edx+8]			; [Empty stack]
		ret
	}
}
#else
	#if PFN_VECTORMA
void __cdecl _VectorMA( const Vector &start, float scale, const Vector &direction, Vector &dest )
    #else
void __cdecl VectorMA( const Vector &start, float scale, const Vector &direction, Vector &dest )
	#endif
{
	Assert( s_bMathlibInitialized );
	dest[0] = start[0] + scale*direction[0];
	dest[1] = start[1] + scale*direction[1];
	dest[2] = start[2] + scale*direction[2];
}
#endif

#ifdef _WIN32
#if PFN_VECTORMA
void 
_declspec(naked)
__cdecl _SSE_VectorMA( const Vector &start, float scale, const Vector &direction, Vector &dest )
{
	// FIXME: This don't work!! It will overwrite memory in the write to dest
	Assert(0);
	Assert( s_bMathlibInitialized );
	_asm 
	{  
		// Intel SSE only routine
		mov	eax, DWORD PTR [esp+0x04]	; *start, s0..s2
		mov ecx, DWORD PTR [esp+0x0c]	; *direction, d0..d2
		mov edx, DWORD PTR [esp+0x10]	; *dest
		movss	xmm2, [esp+0x08]		; x2 = scale, 0, 0, 0
#ifdef ALIGNED_VECTOR
		movaps	xmm3, [ecx]				; x3 = dir0,dir1,dir2,X
		pshufd	xmm2, xmm2, 0			; x2 = scale, scale, scale, scale
		movaps	xmm1, [eax]				; x1 = start1, start2, start3, X
		mulps	xmm3, xmm2				; x3 *= x2
		addps	xmm3, xmm1				; x3 += x1
		movaps	[edx], xmm3				; *dest = x3
#else
		movups	xmm3, [ecx]				; x3 = dir0,dir1,dir2,X
		pshufd	xmm2, xmm2, 0			; x2 = scale, scale, scale, scale
		movups	xmm1, [eax]				; x1 = start1, start2, start3, X
		mulps	xmm3, xmm2				; x3 *= x2
		addps	xmm3, xmm1				; x3 += x1
		movups	[edx], xmm3				; *dest = x3
#endif
	}
}

float (__cdecl *pfVectorMA)(Vector& v) = _VectorMA;
#endif
#endif

//-----------------------------------------------------------------------------
// Euler QAngle -> Basis Vectors
//-----------------------------------------------------------------------------

void AngleVectors (const QAngle &angles, Vector *forward)
{
	Assert( s_bMathlibInitialized );
	Assert( forward );
	
	float	sp, sy, cp, cy;
	
	SinCos( DEG2RAD( angles[YAW] ), &sy, &cy );
	SinCos( DEG2RAD( angles[PITCH] ), &sp, &cp );
	
	forward->x = cp*cy;
	forward->y = cp*sy;
	forward->z = -sp;
}

void AngleVectors (const QAngle &angles, Vector *forward, Vector *right, Vector *up)
{
	Assert( s_bMathlibInitialized );
	float sr, sp, sy, cr, cp, cy;
	
	SinCos( DEG2RAD( angles[YAW] ), &sy, &cy );
	SinCos( DEG2RAD( angles[PITCH] ), &sp, &cp );
	SinCos( DEG2RAD( angles[ROLL] ), &sr, &cr );

	if (forward)
	{
		forward->x = cp*cy;
		forward->y = cp*sy;
		forward->z = -sp;
	}

	if (right)
	{
		right->x = (-1*sr*sp*cy+-1*cr*-sy);
		right->y = (-1*sr*sp*sy+-1*cr*cy);
		right->z = -1*sr*cp;
	}

	if (up)
	{
		up->x = (cr*sp*cy+-sr*-sy);
		up->y = (cr*sp*sy+-sr*cy);
		up->z = cr*cp;
	}
}

//-----------------------------------------------------------------------------
// Euler QAngle -> Basis Vectors transposed
//-----------------------------------------------------------------------------

void AngleVectorsTranspose (const QAngle &angles, Vector *forward, Vector *right, Vector *up)
{
	Assert( s_bMathlibInitialized );
	float sr, sp, sy, cr, cp, cy;
	
	SinCos( DEG2RAD( angles[YAW] ), &sy, &cy );
	SinCos( DEG2RAD( angles[PITCH] ), &sp, &cp );
	SinCos( DEG2RAD( angles[ROLL] ), &sr, &cr );

	if (forward)
	{
		forward->x	= cp*cy;
		forward->y	= (sr*sp*cy+cr*-sy);
		forward->z	= (cr*sp*cy+-sr*-sy);
	}
	
	if (right)
	{
		right->x	= cp*sy;
		right->y	= (sr*sp*sy+cr*cy);
		right->z	= (cr*sp*sy+-sr*cy);
	}

	if (up)
	{
		up->x		= -sp;
		up->y		= sr*cp;
		up->z		= cr*cp;
	}
}

//-----------------------------------------------------------------------------
// Forward direction vector -> Euler angles
//-----------------------------------------------------------------------------

void VectorAngles( const Vector& forward, QAngle &angles )
{
	Assert( s_bMathlibInitialized );
	float	tmp, yaw, pitch;
	
	if (forward[1] == 0 && forward[0] == 0)
	{
		yaw = 0;
		if (forward[2] > 0)
			pitch = 270;
		else
			pitch = 90;
	}
	else
	{
		yaw = (atan2(forward[1], forward[0]) * 180 / M_PI);
		if (yaw < 0)
			yaw += 360;

		tmp = FastSqrt (forward[0]*forward[0] + forward[1]*forward[1]);
		pitch = (atan2(-forward[2], tmp) * 180 / M_PI);
		if (pitch < 0)
			pitch += 360;
	}
	
	angles[0] = pitch;
	angles[1] = yaw;
	angles[2] = 0;
}

//-----------------------------------------------------------------------------
// Forward direction vector with a reference up vector -> Euler angles
//-----------------------------------------------------------------------------

void VectorAngles( const Vector &forward, const Vector &pseudoup, QAngle &angles )
{
	Assert( s_bMathlibInitialized );

	Vector left;

	CrossProduct( forward, pseudoup, left );
	VectorNormalizeFast( left );		
	
	float xyDist = sqrtf( forward[0] * forward[0] + forward[1] * forward[1] );

	// enough here to get angles?
	if ( xyDist > 0.001f )
	{
		// (yaw)	y = ATAN( forward.y, forward.x );		-- in our space, forward is the X axis
		angles[1] = RAD2DEG( atan2f( forward[1], forward[0] ) );

		// The engine does pitch inverted from this, but we always end up negating it in the DLL
		// UNDONE: Fix the engine to make it consistent
		// (pitch)	x = ATAN( -forward.z, sqrt(forward.x*forward.x+forward.y*forward.y) );
		angles[0] = RAD2DEG( atan2f( -forward[2], xyDist ) );

		float up_z = (left[0] * forward[1]) - (left[1] * forward[0]);

		// (roll)	z = ATAN( left.z, up.z );
		angles[2] = RAD2DEG( atan2f( left[2], up_z ) );
	}
	else	// forward is mostly Z, gimbal lock-
	{
		// (yaw)	y = ATAN( -left.x, left.y );			-- forward is mostly z, so use right for yaw
		angles[1] = RAD2DEG( atan2f( left[0], -left[1] ) ); //This was originally copied from the "void MatrixAngles( const matrix3x4_t& matrix, float *angles )" code, and it's 180 degrees off, negated the values and it all works now (Dave Kircher)

		// The engine does pitch inverted from this, but we always end up negating it in the DLL
		// UNDONE: Fix the engine to make it consistent
		// (pitch)	x = ATAN( -forward.z, sqrt(forward.x*forward.x+forward.y*forward.y) );
		angles[0] = RAD2DEG( atan2f( -forward[2], xyDist ) );

		// Assume no roll in this case as one degree of freedom has been lost (i.e. yaw == roll)
		angles[2] = 0;
	}	
}

void SetIdentityMatrix( matrix3x4_t& matrix )
{
	memset( matrix.Base(), 0, sizeof(float)*3*4 );
	matrix[0][0] = 1.0;
	matrix[1][1] = 1.0;
	matrix[2][2] = 1.0;
}

//-----------------------------------------------------------------------------
// Purpose: converts engine euler angles into a matrix
// Input  : vec3_t angles - PITCH, YAW, ROLL
// Output : *matrix - left-handed column matrix
//			the basis vectors for the rotations will be in the columns as follows:
//			matrix[][0] is forward
//			matrix[][1] is left
//			matrix[][2] is up
//-----------------------------------------------------------------------------
void AngleMatrix( RadianEuler const &angles, const Vector &position, matrix3x4_t& matrix )
{
	AngleMatrix( angles, matrix );
	MatrixSetColumn( position, 3, matrix );
}

void AngleMatrix( const RadianEuler& angles, matrix3x4_t& matrix )
{
	QAngle quakeEuler( RAD2DEG( angles.y ), RAD2DEG( angles.z ), RAD2DEG( angles.x ) );

	AngleMatrix( quakeEuler, matrix );
}


void AngleMatrix( const QAngle &angles, const Vector &position, matrix3x4_t& matrix )
{
	AngleMatrix( angles, matrix );
	MatrixSetColumn( position, 3, matrix );
}

void AngleMatrix( const QAngle &angles, matrix3x4_t& matrix )
{
#ifdef _VPROF_MATHLIB
	VPROF_BUDGET( "AngleMatrix", "Mathlib" );
#endif
	Assert( s_bMathlibInitialized );
	float		sr, sp, sy, cr, cp, cy;

	SinCos( DEG2RAD( angles[YAW] ), &sy, &cy );
	SinCos( DEG2RAD( angles[PITCH] ), &sp, &cp );
	SinCos( DEG2RAD( angles[ROLL] ), &sr, &cr );

	// matrix = (YAW * PITCH) * ROLL
	matrix[0][0] = cp*cy;
	matrix[1][0] = cp*sy;
	matrix[2][0] = -sp;
	matrix[0][1] = sr*sp*cy+cr*-sy;
	matrix[1][1] = sr*sp*sy+cr*cy;
	matrix[2][1] = sr*cp;
	matrix[0][2] = (cr*sp*cy+-sr*-sy);
	matrix[1][2] = (cr*sp*sy+-sr*cy);
	matrix[2][2] = cr*cp;
	matrix[0][3] = 0.f;
	matrix[1][3] = 0.f;
	matrix[2][3] = 0.f;
}

void AngleIMatrix( const RadianEuler& angles, matrix3x4_t& matrix )
{
	QAngle quakeEuler( RAD2DEG( angles.y ), RAD2DEG( angles.z ), RAD2DEG( angles.x ) );

	AngleIMatrix( quakeEuler, matrix );
}

void AngleIMatrix (const QAngle& angles, matrix3x4_t& matrix )
{
	Assert( s_bMathlibInitialized );
	float		sr, sp, sy, cr, cp, cy;
	
	SinCos( DEG2RAD( angles[YAW] ), &sy, &cy );
	SinCos( DEG2RAD( angles[PITCH] ), &sp, &cp );
	SinCos( DEG2RAD( angles[ROLL] ), &sr, &cr );

	// matrix = (YAW * PITCH) * ROLL
	matrix[0][0] = cp*cy;
	matrix[0][1] = cp*sy;
	matrix[0][2] = -sp;
	matrix[1][0] = sr*sp*cy+cr*-sy;
	matrix[1][1] = sr*sp*sy+cr*cy;
	matrix[1][2] = sr*cp;
	matrix[2][0] = (cr*sp*cy+-sr*-sy);
	matrix[2][1] = (cr*sp*sy+-sr*cy);
	matrix[2][2] = cr*cp;
	matrix[0][3] = 0.f;
	matrix[1][3] = 0.f;
	matrix[2][3] = 0.f;
}

void AngleIMatrix (const QAngle &angles, const Vector &position, matrix3x4_t &mat )
{
	AngleIMatrix( angles, mat );

	Vector vecTranslation;
	VectorRotate( position, mat, vecTranslation );
	vecTranslation *= -1.0f;
	MatrixSetColumn( vecTranslation, 3, mat );
}


//-----------------------------------------------------------------------------
// Intersects a sphere against an axis aligned box
//-----------------------------------------------------------------------------
bool SphereToAABBIntersection( const Vector& sphCenter, float sphRadius, 
			   				   const Vector& boxMin, const Vector& boxMax )
{
	Assert( s_bMathlibInitialized );
	int		i;
	float	s;
	float	dist = 0;

	for( i = 0; i < 3; i++ )
	{
		if( sphCenter[i] < boxMin[i] )
		{
			s = sphCenter[i] - boxMin[i];
			dist += s * s;
		}
		else if( sphCenter[i] > boxMax[i] )
		{
			s = sphCenter[i] - boxMax[i];
			dist += s * s;
		}
	}
	
	return ( dist <= ( sphRadius * sphRadius ) );
}


//-----------------------------------------------------------------------------
// Bounding box construction methods
//-----------------------------------------------------------------------------

void ClearBounds (Vector& mins, Vector& maxs)
{
	Assert( s_bMathlibInitialized );
	mins[0] = mins[1] = mins[2] = 99999;
	maxs[0] = maxs[1] = maxs[2] = -99999;
}

void AddPointToBounds (const Vector& v, Vector& mins, Vector& maxs)
{
	Assert( s_bMathlibInitialized );
	int		i;
	vec_t	val;

	for (i=0 ; i<3 ; i++)
	{
		val = v[i];
		if (val < mins[i])
			mins[i] = val;
		if (val > maxs[i])
			maxs[i] = val;
	}
}



//-----------------------------------------------------------------------------
// Gamma conversion support
//-----------------------------------------------------------------------------
static byte		texgammatable[256];	// palette is sent through this to convert to screen gamma

static float	texturetolinear[256];	// texture (0..255) to linear (0..1)
static int		lineartotexture[1024];	// linear (0..1) to texture (0..255)
static int		lineartoscreen[1024];	// linear (0..1) to gamma corrected vertex light (0..255)

// build a lightmap texture to combine with surface texture, adjust for src*dst+dst*src, ramp reprogramming, etc
float			lineartovertex[4096];	// linear (0..4) to screen corrected vertex space (0..1?)
unsigned char	lineartolightmap[4096];	// linear (0..4) to screen corrected texture value (0..255)

static float	g_Mathlib_GammaToLinear[256];	// gamma (0..1) to linear (0..1)
static float	g_Mathlib_LinearToGamma[256];	// linear (0..1) to gamma (0..1)

float	power2_n[256];			// 2**(index - 128)

void BuildExponentTable( )
{
	for( int i = 0; i < 256; i++ )
	{
		power2_n[i] = pow( 2.0f, i - 128 ) / 255.0f;
	}
}

void BuildGammaTable( float gamma, float texGamma, float brightness, int overbright )
{
	int		i, inf;
	float	g1, g3;

	// Con_Printf("BuildGammaTable %.1f %.1f %.1f\n", g, v_lightgamma.GetFloat(), v_texgamma.GetFloat() );

	float g = gamma;
	if (g > 3.0) 
	{
		g = 3.0;
	}

	g = 1.0 / g;
	g1 = texGamma * g; 

	if (brightness <= 0.0) 
	{
		g3 = 0.125;
	}
	else if (brightness > 1.0) 
	{
		g3 = 0.05;
	}
	else 
	{
		g3 = 0.125 - (brightness * brightness) * 0.075;
	}

	for (i=0 ; i<256 ; i++)
	{
		inf = 255 * pow ( i/255.f, g1 ); 
		if (inf < 0)
			inf = 0;
		if (inf > 255)
			inf = 255;
		texgammatable[i] = inf;
	}

	for (i=0 ; i<1024 ; i++)
	{
		float f;

		f = i / 1023.0;

		// scale up
		if (brightness > 1.0)
			f = f * brightness;

		// shift up
		if (f <= g3)
			f = (f / g3) * 0.125;
		else 
			f = 0.125 + ((f - g3) / (1.0 - g3)) * 0.875;

		// convert linear space to desired gamma space
		inf = 255 * pow ( f, g ); 

		if (inf < 0)
			inf = 0;
		if (inf > 255)
			inf = 255;
		lineartoscreen[i] = inf;
	}

	/*
	for (i=0 ; i<1024 ; i++)
	{
		// convert from screen gamma space to linear space
		lineargammatable[i] = 1023 * pow ( i/1023.0, v_gamma.GetFloat() );
		// convert from linear gamma space to screen space
		screengammatable[i] = 1023 * pow ( i/1023.0, 1.0 / v_gamma.GetFloat() );
	}
	*/

	for (i=0 ; i<256 ; i++)
	{
		// convert from nonlinear texture space (0..255) to linear space (0..1)
		texturetolinear[i] =  pow( i / 255.f, texGamma );

		// convert from linear space (0..1) to nonlinear (sRGB) space (0..1)
		g_Mathlib_LinearToGamma[i] =  pow( i / 255.f, 1.0f / 2.2f );

		// convert from sRGB gamma space (0..1) to linear space (0..1)
		g_Mathlib_GammaToLinear[i] =  pow( i / 255.f, 2.2f );
	}

	for (i=0 ; i<1024 ; i++)
	{
		// convert from linear space (0..1) to nonlinear texture space (0..255)
		lineartotexture[i] =  pow( i / 1023.0, 1.0 / texGamma ) * 255;
	}

	BuildExponentTable();

#if 0
	for (i=0 ; i<256 ; i++)
	{
		float f;

		// convert from nonlinear lightmap space (0..255) to linear space (0..4)
		// f =  (i / 255.0) * sqrt( 4 );
		f =  i * (2.0 / 255.0);
		f = f * f;

		texlighttolinear[i] = f;
	}
#endif

	{
		float f;
		float overbrightFactor = 1.0f;

		// Can't do overbright without texcombine
		// UNDONE: Add GAMMA ramp to rectify this
		if ( overbright == 2 )
		{
			overbrightFactor = 0.5;
		}
		else if ( overbright == 4 )
		{
			overbrightFactor = 0.25;
		}

		for (i=0 ; i<4096 ; i++)
		{
			// convert from linear 0..4 (x1024) to screen corrected vertex space (0..1?)
			f = pow ( i/1024.0, 1.0 / gamma );

			lineartovertex[i] = f * overbrightFactor;
			if (lineartovertex[i] > 1)
				lineartovertex[i] = 1;

			int nLightmap = RoundFloatToInt( f * 255 * overbrightFactor );
			nLightmap = clamp( nLightmap, 0, 255 );
			lineartolightmap[i] = (unsigned char)nLightmap;
		}
	}
}

float GammaToLinearFullRange( float gamma )
{
	return pow( gamma, 2.2f );
}

float LinearToGammaFullRange( float linear )
{
	return pow( linear, 1.0f / 2.2f );
}

float GammaToLinear( float gamma )
{
	Assert( s_bMathlibInitialized );
//	return pow( gamma, 2.2f );
	if( gamma < 0.0f )
	{
		return 0.0f;
	}
	Assert( gamma <= 1.0f );
	if( gamma >= 0.95f)
	{
		// Use GammaToLinearFullRange maybe if you trip this.
		return 1.0f;
	}

	int index = RoundFloatToInt( gamma * 255.0f );
	Assert( index >= 0 && index < 256 );
	return g_Mathlib_GammaToLinear[index];
}

float LinearToGamma( float linear )
{
	Assert( s_bMathlibInitialized );
//	return pow( linear, 1.0f / 2.2f );
	if( linear < 0.0f )
	{
		return 0.0f;
	}
	if( linear > 1.0f )
	{
		// Use LinearToGammaFullRange maybe if you trip this.
		Assert( 0 );
		return 1.0f;
	}

	int index = RoundFloatToInt( linear * 255.0f );
	Assert( index >= 0 && index < 256 );
	return g_Mathlib_LinearToGamma[index];
}

// convert texture to linear 0..1 value
float TextureToLinear( int c )
{
	Assert( s_bMathlibInitialized );
	if (c < 0)
		return 0;
	if (c > 255)
		return 1.0;

	return texturetolinear[c];
}

// convert texture to linear 0..1 value
int LinearToTexture( float f )
{
	Assert( s_bMathlibInitialized );
	int i;
	i = f * 1023;	// assume 0..1 range
	if (i < 0)
		i = 0;
	if (i > 1023)
		i = 1023;

	return lineartotexture[i];
}


// converts 0..1 linear value to screen gamma (0..255)
int LinearToScreenGamma( float f )
{
	Assert( s_bMathlibInitialized );
	int i;
	i = f * 1023;	// assume 0..1 range
	if (i < 0)
		i = 0;
	if (i > 1023)
		i = 1023;

	return lineartoscreen[i];
}

// solve a x^2 + b x + c = 0
bool SolveQuadratic( float a, float b, float c, float &root1, float &root2 )
{
	Assert( s_bMathlibInitialized );
	if (a == 0)
	{
		if (b != 0)
		{
			// no x^2 component, it's a linear system
			root1 = root2 = -c / b;
			return true;
		}
		if (c == 0)
		{
			// all zero's
			root1 = root2 = 0;
			return true;
		}
		return false;
	}

	float tmp = b * b - 4.0f * a * c;

	if (tmp < 0)
	{
		// imaginary number, bah, no solution.
		return false;
	}

	tmp = sqrt( tmp );
	root1 = (-b + tmp) / (2.0f * a);
	root2 = (-b - tmp) / (2.0f * a);
	return true;
}



// solves for "a, b, c" where "a x^2 + b x + c = y", return true if solution exists
bool SolveInverseQuadratic( float x1, float y1, float x2, float y2, float x3, float y3, float &a, float &b, float &c )
{
	float det = (x1 - x2)*(x1 - x3)*(x2 - x3);

	// FIXME: check with some sort of epsilon
	if (det == 0.0)
		return false;

	a = (x3*(-y1 + y2) + x2*(y1 - y3) + x1*(-y2 + y3)) / det;

	b = (x3*x3*(y1 - y2) + x1*x1*(y2 - y3) + x2*x2*(-y1 + y3)) / det;

    c = (x1*x3*(-x1 + x3)*y2 + x2*x2*(x3*y1 - x1*y3) + x2*(-(x3*x3*y1) + x1*x1*y3)) / det;

	return true;
}

bool SolveInverseQuadraticMonotonic( float x1, float y1, float x2, float y2, float x3, float y3, 
									 float &a, float &b, float &c )
{
	// use SolveInverseQuadratic, but if the sigm of the derivative at the start point is the wrong
	// sign, displace the mid point
	
	// first, sort parameters
	if (x1>x2)
	{
		swap(x1,x2);
		swap(y1,y2);
	}
	if (x2>x3)
	{
		swap(x2,x3);
		swap(y2,y3);
	}
	if (x1>x2)
	{
		swap(x1,x2);
		swap(y1,y2);
	}
	// this code is not fast. what it does is when the curve would be non-monotonic, slowly shifts
	// the center point closer to the linear line between the endpoints. Should anyone need htis
	// function to be actually fast, it would be fairly easy to change it to be so.
	for(float blend_to_linear_factor=0.0;blend_to_linear_factor<=1.0;blend_to_linear_factor+=0.05)
	{
		float tempy2=(1-blend_to_linear_factor)*y2+blend_to_linear_factor*FLerp(y1,y3,x1,x3,x2);
		if (!SolveInverseQuadratic(x1,y1,x2,tempy2,x3,y3,a,b,c))
			return false;
		float derivative=2.0*a+b;
		if ( (y1<y2) && (y2<y3))							// monotonically increasing
		{
			if (derivative>=0.0)
				return true;
		}
		else
		{
			if ( (y1>y2) && (y2>y3))							// monotonically decreasing
			{
				if (derivative<=0.0)
					return true;
			}
			else
				return true;
		}
	}
	return true;
}


// solves for "a, b, c" where "1/(a x^2 + b x + c ) = y", return true if solution exists
bool SolveInverseReciprocalQuadratic( float x1, float y1, float x2, float y2, float x3, float y3, float &a, float &b, float &c )
{
	float det = (x1 - x2)*(x1 - x3)*(x2 - x3)*y1*y2*y3;

	// FIXME: check with some sort of epsilon
	if (det == 0.0)
		return false;

	a = (x1*y1*(y2 - y3) + x3*(y1 - y2)*y3 + x2*y2*(-y1 + y3)) / det;

	b = (x2*x2*y2*(y1 - y3) + x3*x3*(-y1 + y2)*y3 + x1*x1*y1*(-y2 + y3)) / det;

	c = (x2*(x2 - x3)*x3*y2*y3 + x1*x1*y1*(x2*y2 - x3*y3) + x1*(-(x2*x2*y1*y2) + x3*x3*y1*y3)) / det;

	return true;
}


// Rotate a vector around the Z axis (YAW)
void VectorYawRotate( const Vector &in, float flYaw, Vector &out)
{
	Assert( s_bMathlibInitialized );
	if (&in == &out )
	{
		Vector tmp;
		tmp = in;
		VectorYawRotate( tmp, flYaw, out );
		return;
	}

	float sy, cy;

	SinCos( DEG2RAD(flYaw), &sy, &cy );

	out.x = in.x * cy - in.y * sy;
	out.y = in.x * sy + in.y * cy;
	out.z = in.z;
}



float Bias( float x, float biasAmt )
{
	// WARNING: not thread safe
	static float lastAmt = -1;
	static float lastExponent = 0;
	if( lastAmt != biasAmt )
	{
		lastExponent = log( biasAmt ) * -1.4427f; // (-1.4427 = 1 / log(0.5))
	}
	return pow( x, lastExponent );
}


float Gain( float x, float biasAmt )
{
	// WARNING: not thread safe
	if( x < 0.5 )
		return 0.5f * Bias( 2*x, 1-biasAmt );
	else
		return 1 - 0.5f * Bias( 2 - 2*x, 1-biasAmt );
}


float SmoothCurve( float x )
{
	return (1 - cos( x * M_PI )) * 0.5f;
}


inline float MovePeak( float x, float flPeakPos )
{
	// Todo: make this higher-order?
	if( x < flPeakPos )
		return x * 0.5f / flPeakPos;
	else
		return 0.5 + 0.5 * (x - flPeakPos) / (1 - flPeakPos);
}


float SmoothCurve_Tweak( float x, float flPeakPos, float flPeakSharpness )
{
	float flMovedPeak = MovePeak( x, flPeakPos );
	float flSharpened = Gain( flMovedPeak, flPeakSharpness );
	return SmoothCurve( flSharpened );
}

//-----------------------------------------------------------------------------
// make sure quaternions are within 180 degrees of one another, if not, reverse q
//-----------------------------------------------------------------------------

void QuaternionAlign( const Quaternion &p, const Quaternion &q, Quaternion &qt )
{
	Assert( s_bMathlibInitialized );

	// FIXME: can this be done with a quat dot product?

	int i;
	// decide if one of the quaternions is backwards
	float a = 0;
	float b = 0;
	for (i = 0; i < 4; i++) 
	{
		a += (p[i]-q[i])*(p[i]-q[i]);
		b += (p[i]+q[i])*(p[i]+q[i]);
	}
	if (a > b) 
	{
		for (i = 0; i < 4; i++) 
		{
			qt[i] = -q[i];
		}
	}
	else if (&qt != &q)
	{
		for (i = 0; i < 4; i++) 
		{
			qt[i] = q[i];
		}
	}
}


//-----------------------------------------------------------------------------
// Do a piecewise addition of the quaternion elements. This actually makes little 
// mathematical sense, but it's a cheap way to simulate a slerp.
//-----------------------------------------------------------------------------

void QuaternionBlend( const Quaternion &p, const Quaternion &q, float t, Quaternion &qt )
{
	Assert( s_bMathlibInitialized );

	// decide if one of the quaternions is backwards
	Quaternion q2;
	QuaternionAlign( p, q, q2 );

	QuaternionBlendNoAlign( p, q2, t, qt );
}


void QuaternionBlendNoAlign( const Quaternion &p, const Quaternion &q, float t, Quaternion &qt )
{
	Assert( s_bMathlibInitialized );
	float sclp, sclq;
	int i;

	// 0.0 returns p, 1.0 return q.
	sclp = 1.0f - t;
	sclq = t;
	for (i = 0; i < 4; i++) {
		qt[i] = sclp * p[i] + sclq * q[i];
	}
	QuaternionNormalize( qt );
}



void QuaternionIdentityBlend( const Quaternion &p, float t, Quaternion &qt )
{
	Assert( s_bMathlibInitialized );
	float sclp;

	sclp = 1.0f - t;

	qt.x = p.x * sclp;
	qt.y = p.y * sclp;
	qt.z = p.z * sclp;
	if (qt.w < 0.0)
	{
		qt.w = p.w * sclp - t;
	}
	else
	{
		qt.w = p.w * sclp + t;
	}
	QuaternionNormalize( qt );
}

//-----------------------------------------------------------------------------
// Quaternion sphereical linear interpolation
//-----------------------------------------------------------------------------

void QuaternionSlerp( const Quaternion &p, const Quaternion &q, float t, Quaternion &qt )
{
	Quaternion q2;
	// 0.0 returns p, 1.0 return q.

	// decide if one of the quaternions is backwards
	QuaternionAlign( p, q, q2 );

	QuaternionSlerpNoAlign( p, q2, t, qt );
}


void QuaternionSlerpNoAlign( const Quaternion &p, const Quaternion &q, float t, Quaternion &qt )
{
	Assert( s_bMathlibInitialized );
	float omega, cosom, sinom, sclp, sclq;
	int i;

	// 0.0 returns p, 1.0 return q.

	cosom = p[0]*q[0] + p[1]*q[1] + p[2]*q[2] + p[3]*q[3];

	if ((1.0f + cosom) > 0.000001f) {
		if ((1.0f - cosom) > 0.000001f) {
			omega = acos( cosom );
			sinom = sin( omega );
			sclp = sin( (1.0f - t)*omega) / sinom;
			sclq = sin( t*omega ) / sinom;
		}
		else {
			// TODO: add short circuit for cosom == 1.0f?
			sclp = 1.0f - t;
			sclq = t;
		}
		for (i = 0; i < 4; i++) {
			qt[i] = sclp * p[i] + sclq * q[i];
		}
	}
	else {
		Assert( &qt != &q );

		qt[0] = -q[1];
		qt[1] = q[0];
		qt[2] = -q[3];
		qt[3] = q[2];
		sclp = sin( (1.0f - t) * (0.5f * M_PI));
		sclq = sin( t * (0.5f * M_PI));
		for (i = 0; i < 3; i++) {
			qt[i] = sclp * p[i] + sclq * qt[i];
		}
	}

	Assert( qt.IsValid() );
}


//-----------------------------------------------------------------------------
// Purpose: Returns the angular delta between the two normalized quaternions in degrees.
//-----------------------------------------------------------------------------
float QuaternionAngleDiff( const Quaternion &p, const Quaternion &q )
{
#if 1
	// this code path is here for 2 reasons:
	// 1 - acos maps 1-epsilon to values much larger than epsilon (vs asin, which maps epsilon to itself)
	//     this means that in floats, anything below ~0.05 degrees truncates to 0
	// 2 - normalized quaternions are frequently slightly non-normalized due to float precision issues,
	//     and the epsilon off of normalized can be several percents of a degree
	Quaternion qInv, diff;
	QuaternionConjugate( q, qInv );
	QuaternionMult( p, qInv, diff );

	float sinang = sqrt( diff.x * diff.x + diff.y * diff.y + diff.z * diff.z );
	float angle = RAD2DEG( 2 * asin( sinang ) );
	return angle;
#else
	Quaternion q2;
	QuaternionAlign( p, q, q2 );

	Assert( s_bMathlibInitialized );
	float cosom = p.x * q2.x + p.y * q2.y + p.z * q2.z + p.w * q2.w;

	if ( cosom > -1.0f )
	{
		if ( cosom < 1.0f )
		{
			float omega = 2 * fabs( acos( cosom ) );
			return RAD2DEG( omega );
		}
		return 0.0f;
	}

	return 180.0f;
#endif
}

void QuaternionConjugate( const Quaternion &p, Quaternion &q )
{
	Assert( s_bMathlibInitialized );
	Assert( q.IsValid() );

	q.x = -p.x;
	q.y = -p.y;
	q.z = -p.z;
	q.w = p.w;
}

void QuaternionInvert( const Quaternion &p, Quaternion &q )
{
	Assert( s_bMathlibInitialized );
	Assert( q.IsValid() );

	QuaternionConjugate( p, q );

	float magnitudeSqr = QuaternionDotProduct( p, p );
	Assert( magnitudeSqr );
	if ( magnitudeSqr )
	{
		float inv = 1.0f / magnitudeSqr;
		q.x *= inv;
		q.y *= inv;
		q.z *= inv;
		q.w *= inv;
	}
}

//-----------------------------------------------------------------------------
// Make sure the quaternion is of unit length
//-----------------------------------------------------------------------------

float QuaternionNormalize( Quaternion &q )
{
	Assert( s_bMathlibInitialized );
	float radius, iradius;

	Assert( q.IsValid() );

	radius = q[0]*q[0] + q[1]*q[1] + q[2]*q[2] + q[3]*q[3];

	if ( radius ) // > FLT_EPSILON && ((radius < 1.0f - 4*FLT_EPSILON) || (radius > 1.0f + 4*FLT_EPSILON))
	{
		radius = sqrt(radius);
		iradius = 1.0f/radius;
		q[3] *= iradius;
		q[2] *= iradius;
		q[1] *= iradius;
		q[0] *= iradius;
	}
	return radius;
}

void QuaternionScale( const Quaternion &p, float t, Quaternion &q )
{
	Assert( s_bMathlibInitialized );

#if 0
	Quaternion p0;
	Quaternion q;
	p0.Init( 0.0, 0.0, 0.0, 1.0 );

	// slerp in "reverse order" so that p doesn't get realigned
	QuaternionSlerp( p, p0, 1.0 - fabs( t ), q );
	if (t < 0.0)
	{
		q.w = -q.w;
	}
#else
	float r;

	// FIXME: nick, this isn't overly sensitive to accuracy, and it may be faster to 
	// use the cos part (w) of the quaternion (sin(omega)*N,cos(omega)) to figure the new scale.
	float sinom = sqrt( DotProduct( &p.x, &p.x ) );
	sinom = min( sinom, 1.f );

	float sinsom = sin( asin( sinom ) * t );

	t = sinsom / (sinom + FLT_EPSILON);
	VectorScale( &p.x, t, &q.x );

	// rescale rotation
	r = 1.0f - sinsom * sinsom;

	// Assert( r >= 0 );
	if (r < 0.0f) 
		r = 0.0f;
	r = sqrt( r );

	// keep sign of rotation
	if (p.w < 0)
		q.w = -r;
	else
		q.w = r;
#endif

	Assert( q.IsValid() );

	return;
}


void QuaternionAdd( const Quaternion &p, const Quaternion &q, Quaternion &qt )
{
	Assert( s_bMathlibInitialized );
	Assert( p.IsValid() );
	Assert( q.IsValid() );

	// decide if one of the quaternions is backwards
	Quaternion q2;
	QuaternionAlign( p, q, q2 );

	// is this right???
	qt[0] = p[0] + q2[0];
	qt[1] = p[1] + q2[1];
	qt[2] = p[2] + q2[2];
	qt[3] = p[3] + q2[3];

	return;
}


float QuaternionDotProduct( const Quaternion &p, const Quaternion &q )
{
	Assert( s_bMathlibInitialized );
	Assert( p.IsValid() );
	Assert( q.IsValid() );

	return p.x * q.x + p.y * q.y + p.z * q.z + p.w * q.w;
}


// qt = p * q
void QuaternionMult( const Quaternion &p, const Quaternion &q, Quaternion &qt )
{
	Assert( s_bMathlibInitialized );
	Assert( p.IsValid() );
	Assert( q.IsValid() );

	if (&p == &qt)
	{
		Quaternion p2 = p;
		QuaternionMult( p2, q, qt );
		return;
	}

	// decide if one of the quaternions is backwards
	Quaternion q2;
	QuaternionAlign( p, q, q2 );

	qt.x =  p.x * q2.w + p.y * q2.z - p.z * q2.y + p.w * q2.x;
	qt.y = -p.x * q2.z + p.y * q2.w + p.z * q2.x + p.w * q2.y;
	qt.z =  p.x * q2.y - p.y * q2.x + p.z * q2.w + p.w * q2.z;
	qt.w = -p.x * q2.x - p.y * q2.y - p.z * q2.z + p.w * q2.w;
}


void QuaternionMatrix( const Quaternion &q, const Vector &pos, matrix3x4_t& matrix )
{
	Assert( pos.IsValid() );

	QuaternionMatrix( q, matrix );

	matrix[0][3] = pos.x;
	matrix[1][3] = pos.y;
	matrix[2][3] = pos.z;
}

void QuaternionMatrix( const Quaternion &q, matrix3x4_t& matrix )
{
	Assert( s_bMathlibInitialized );
	Assert( q.IsValid() );

#ifdef _VPROF_MATHLIB
	VPROF_BUDGET( "QuaternionMatrix", "Mathlib" );
#endif

// Original code
// This should produce the same code as below with optimization, but looking at the assmebly,
// it doesn't.  There are 7 extra multiplies in the release build of this, go figure.
#if 1
	matrix[0][0] = 1.0 - 2.0 * q.y * q.y - 2.0 * q.z * q.z;
	matrix[1][0] = 2.0 * q.x * q.y + 2.0 * q.w * q.z;
	matrix[2][0] = 2.0 * q.x * q.z - 2.0 * q.w * q.y;

	matrix[0][1] = 2.0f * q.x * q.y - 2.0f * q.w * q.z;
	matrix[1][1] = 1.0f - 2.0f * q.x * q.x - 2.0f * q.z * q.z;
	matrix[2][1] = 2.0f * q.y * q.z + 2.0f * q.w * q.x;

	matrix[0][2] = 2.0f * q.x * q.z + 2.0f * q.w * q.y;
	matrix[1][2] = 2.0f * q.y * q.z - 2.0f * q.w * q.x;
	matrix[2][2] = 1.0f - 2.0f * q.x * q.x - 2.0f * q.y * q.y;
#else
   float wx, wy, wz, xx, yy, yz, xy, xz, zz, x2, y2, z2;

    // precalculate common multiplitcations
    x2 = q.x + q.x; 
	y2 = q.y + q.y; 
    z2 = q.z + q.z;
    xx = q.x * x2;
	xy = q.x * y2;
	xz = q.x * z2;
    yy = q.y * y2;
	yz = q.y * z2;
	zz = q.z * z2;
    wx = q.w * x2;
	wy = q.w * y2;
	wz = q.w * z2;

    matrix[0][0] = 1.0 - (yy + zz);
    matrix[0][1] = xy - wz;
	matrix[0][2] = xz + wy;
                           
    matrix[1][0] = xy + wz;
	matrix[1][1] = 1.0 - (xx + zz);
    matrix[1][2] = yz - wx;

    matrix[2][0] = xz - wy;
	matrix[2][1] = yz + wx;
    matrix[2][2] = 1.0 - (xx + yy);
#endif
}

//-----------------------------------------------------------------------------
// Purpose: Converts engine-format euler angles to a quaternion
// Input  : angles - Right-handed Euler angles in degrees as follows:
//				[0]: PITCH: Clockwise rotation around the Y axis.
//				[1]: YAW:	Counterclockwise rotation around the Z axis.
//				[2]: ROLL:	Counterclockwise rotation around the X axis.
//			*outQuat - quaternion of form (i,j,k,real)
//-----------------------------------------------------------------------------
void AngleQuaternion( const QAngle &angles, Quaternion &outQuat )
{
	RadianEuler radians;
	radians.Init( DEG2RAD( angles.z ), DEG2RAD( angles.x ), DEG2RAD( angles.y ) );

	AngleQuaternion( radians, outQuat );
}

//-----------------------------------------------------------------------------
// Purpose: Converts a quaternion into engine angles
// Input  : *quaternion - q3 + q0.i + q1.j + q2.k
//			*outAngles - PITCH, YAW, ROLL
//-----------------------------------------------------------------------------
void QuaternionAngles( const Quaternion &q, QAngle &angles )
{
	Assert( s_bMathlibInitialized );
	Assert( q.IsValid() );

#ifdef _VPROF_MATHLIB
	VPROF_BUDGET( "QuaternionAngles", "Mathlib" );
#endif

#if 1
	// FIXME: doing it this way calculates too much data, needs to do an optimized version...
	matrix3x4_t matrix;
	QuaternionMatrix( q, matrix );
	MatrixAngles( matrix, angles );
#else
	float m11, m12, m13, m23, m33;

	m11 = ( 2.0f * q.w * q.w ) + ( 2.0f * q.x * q.x ) - 1.0f;
	m12 = ( 2.0f * q.x * q.y ) + ( 2.0f * q.w * q.z );
	m13 = ( 2.0f * q.x * q.z ) - ( 2.0f * q.w * q.y );
	m23 = ( 2.0f * q.y * q.z ) + ( 2.0f * q.w * q.x );
	m33 = ( 2.0f * q.w * q.w ) + ( 2.0f * q.z * q.z ) - 1.0f;

	// FIXME: this code has a singularity near PITCH +-90
	angles[YAW] = RAD2DEG( atan2(m12, m11) );
	angles[PITCH] = RAD2DEG( asin(-m13) );
	angles[ROLL] = RAD2DEG( atan2(m23, m33) );
#endif

	Assert( angles.IsValid() );
}

//-----------------------------------------------------------------------------
// Purpose: Converts a quaternion to an axis / angle in degrees
//			(exponential map)
//-----------------------------------------------------------------------------
void QuaternionAxisAngle( const Quaternion &q, Vector &axis, float &angle )
{
	angle = RAD2DEG(2 * acos(q.w));
	if ( angle > 180 )
	{
		angle -= 360;
	}
	axis.x = q.x;
	axis.y = q.y;
	axis.z = q.z;
	VectorNormalize( axis );
}

//-----------------------------------------------------------------------------
// Purpose: Converts an exponential map (ang/axis) to a quaternion
//-----------------------------------------------------------------------------
void AxisAngleQuaternion( const Vector &axis, float angle, Quaternion &q )
{
	float sa, ca;
	
	SinCos( DEG2RAD(angle) * 0.5f, &sa, &ca );
	
	q.x = axis.x * sa;
	q.y = axis.y * sa;
	q.z = axis.z * sa;
	q.w = ca;
}


//-----------------------------------------------------------------------------
// Purpose: Converts radian-euler axis aligned angles to a quaternion
// Input  : *pfAngles - Right-handed Euler angles in radians
//			*outQuat - quaternion of form (i,j,k,real)
//-----------------------------------------------------------------------------
void AngleQuaternion( RadianEuler const &angles, Quaternion &outQuat )
{
	Assert( s_bMathlibInitialized );
//	Assert( angles.IsValid() );

#ifdef _VPROF_MATHLIB
	VPROF_BUDGET( "AngleQuaternion", "Mathlib" );
#endif

	float sr, sp, sy, cr, cp, cy;

	SinCos( angles.z * 0.5f, &sy, &cy );
	SinCos( angles.y * 0.5f, &sp, &cp );
	SinCos( angles.x * 0.5f, &sr, &cr );

	// NJS: for some reason VC6 wasn't recognizing the common subexpressions:
	float srXcp = sr * cp,
		  crXsp = cr * sp;
	outQuat.x = srXcp*cy-crXsp*sy; // X
	outQuat.y = crXsp*cy+srXcp*sy; // Y

	float crXcp = cr * cp,
		  srXsp = sr * sp;
	outQuat.z = crXcp*sy-srXsp*cy; // Z
	outQuat.w = crXcp*cy+srXsp*sy; // W (real component)

//	Assert( outQuat.IsValid() );
}


//-----------------------------------------------------------------------------
// Purpose: Converts a basis to a quaternion
//-----------------------------------------------------------------------------
void BasisToQuaternion( const Vector &vecForward, const Vector &vecRight, const Vector &vecUp, Quaternion &q )
{
	Assert( fabs( vecForward.LengthSqr() - 1.0f ) < 1e-3 );
	Assert( fabs( vecRight.LengthSqr() - 1.0f ) < 1e-3 );
	Assert( fabs( vecUp.LengthSqr() - 1.0f ) < 1e-3 );

	Vector vecLeft;
	VectorMultiply( vecRight, -1.0f, vecLeft );

	// FIXME: Don't know why, but this doesn't match at all with other result
	// so we can't use this super-fast way.
	/*
	// Find the trace of the matrix:
	float flTrace = vecForward.x + vecLeft.y + vecUp.z + 1.0f;
	if ( flTrace > 1e-6 )
	{
		float flSqrtTrace = FastSqrt( flTrace );
		float s = 0.5f / flSqrtTrace;
		q.x = ( vecUp.y - vecLeft.z ) * s;
		q.y = ( vecForward.z - vecUp.x ) * s;
		q.z = ( vecLeft.x - vecForward.y ) * s;
		q.w = 0.5f * flSqrtTrace;
	}
	else
	{
		if (( vecForward.x > vecLeft.y ) && ( vecForward.x > vecUp.z ) )
		{
			float flSqrtTrace = FastSqrt( 1.0f + vecForward.x - vecLeft.y - vecUp.z );
			float s = 0.5f / flSqrtTrace;
			q.x = 0.5f * flSqrtTrace;
			q.y = ( vecForward.y + vecLeft.x ) * s;
			q.z = ( vecUp.x + vecForward.z ) * s;
			q.w = ( vecUp.y - vecLeft.z ) * s;
		}
		else if ( vecLeft.y > vecUp.z )
		{
			float flSqrtTrace = FastSqrt( 1.0f + vecLeft.y - vecForward.x - vecUp.z );
			float s = 0.5f / flSqrtTrace;
			q.x = ( vecForward.y + vecLeft.x ) * s;
			q.y = 0.5f * flSqrtTrace;
			q.z = ( vecUp.y + vecLeft.z ) * s;
			q.w = ( vecForward.z - vecUp.x ) * s;
		}
		else
		{
			float flSqrtTrace = FastSqrt( 1.0 + vecUp.z - vecForward.x - vecLeft.y );
			float s = 0.5f / flSqrtTrace;
			q.x = ( vecUp.x + vecForward.z ) * s;
			q.y = ( vecUp.y + vecLeft.z ) * s;
			q.z = 0.5f * flSqrtTrace;
			q.w = ( vecLeft.x - vecForward.y ) * s;
		}
	}
	QuaternionNormalize( q );
	*/

	// Version 2: Go through angles

	matrix3x4_t mat;
	MatrixSetColumn( vecForward, 0, mat );
	MatrixSetColumn( vecLeft, 1, mat );
	MatrixSetColumn( vecUp, 2, mat );

	QAngle angles;
	MatrixAngles( mat, angles );

//	Quaternion q2;
	AngleQuaternion( angles, q );

//	Assert( fabs(q.x - q2.x) < 1e-3 );
//	Assert( fabs(q.y - q2.y) < 1e-3 );
//	Assert( fabs(q.z - q2.z) < 1e-3 );
//	Assert( fabs(q.w - q2.w) < 1e-3 );
}


//-----------------------------------------------------------------------------
// Purpose: Converts a quaternion into engine angles
// Input  : *quaternion - q3 + q0.i + q1.j + q2.k
//			*outAngles - PITCH, YAW, ROLL
//-----------------------------------------------------------------------------
void QuaternionAngles( const Quaternion &q, RadianEuler &angles )
{
	Assert( s_bMathlibInitialized );
	Assert( q.IsValid() );

	// FIXME: doing it this way calculates too much data, needs to do an optimized version...
	matrix3x4_t matrix;
	QuaternionMatrix( q, matrix );
	MatrixAngles( matrix, angles );

	Assert( angles.IsValid() );
}

//-----------------------------------------------------------------------------
// Purpose: A helper function to normalize p2.x->p1.x and p3.x->p4.x to 
//  be the same length as p2.x->p3.x
// Input  : &p2 - 
//			&p4 - 
//			p4n - 
//-----------------------------------------------------------------------------
void Spline_Normalize( 
	const Vector &p1,
	const Vector &p2,
	const Vector &p3,
	const Vector &p4,
	Vector& p1n,
	Vector& p4n )
{
	float dt = p3.x - p2.x;

	p1n = p1;
	p4n = p4;

	if ( dt != 0.0 )
	{
		if (p1.x != p2.x)
		{
			// Equivalent to p1n = p2 - (p2 - p1) * (dt / (p2.x - p1.x));
			VectorLerp( p2, p1, dt / (p2.x - p1.x), p1n );
		}
		if (p4.x != p3.x)
		{
			// Equivalent to p4n = p3 + (p4 - p3) * (dt / (p4.x - p3.x));
			VectorLerp( p3, p4, dt / (p4.x - p3.x), p4n );
		}
	}
}

//-----------------------------------------------------------------------------
// Purpose: 
// Input  : 
//-----------------------------------------------------------------------------

void Catmull_Rom_Spline(
	const Vector &p1,
	const Vector &p2,
	const Vector &p3,
	const Vector &p4,
	float t, 
	Vector& output )
{
	Assert( s_bMathlibInitialized );
	float tSqr = t*t*0.5f;
	float tSqrSqr = t*tSqr;
	t *= 0.5f;

	Assert( &output != &p1 );
	Assert( &output != &p2 );
	Assert( &output != &p3 );
	Assert( &output != &p4 );

	output.Init();

	Vector a, b, c, d;

	// matrix row 1
	VectorScale( p1, -tSqrSqr, a );		// 0.5 t^3 * [ (-1*p1) + ( 3*p2) + (-3*p3) + p4 ]
	VectorScale( p2, tSqrSqr*3, b );
	VectorScale( p3, tSqrSqr*-3, c );
	VectorScale( p4, tSqrSqr, d );

	VectorAdd( a, output, output );
	VectorAdd( b, output, output );
	VectorAdd( c, output, output );
	VectorAdd( d, output, output );

	// matrix row 2
	VectorScale( p1, tSqr*2,  a );		// 0.5 t^2 * [ ( 2*p1) + (-5*p2) + ( 4*p3) - p4 ]
	VectorScale( p2, tSqr*-5, b );
	VectorScale( p3, tSqr*4,  c );
	VectorScale( p4, -tSqr,    d );

	VectorAdd( a, output, output );
	VectorAdd( b, output, output );
	VectorAdd( c, output, output );
	VectorAdd( d, output, output );

	// matrix row 3
	VectorScale( p1, -t, a );			// 0.5 t * [ (-1*p1) + p3 ]
	VectorScale( p3, t,  b );

	VectorAdd( a, output, output );
	VectorAdd( b, output, output );

	// matrix row 4
	VectorAdd( p2, output, output );	// p2
}

void Catmull_Rom_Spline_Tangent(
	const Vector &p1,
	const Vector &p2,
	const Vector &p3,
	const Vector &p4,
	float t, 
	Vector& output )
{
	Assert( s_bMathlibInitialized );
	float tOne = 3*t*t*0.5f;
	float tTwo = 2*t*0.5f;
	float tThree = 0.5;

	Assert( &output != &p1 );
	Assert( &output != &p2 );
	Assert( &output != &p3 );
	Assert( &output != &p4 );

	output.Init();

	Vector a, b, c, d;

	// matrix row 1
	VectorScale( p1, -tOne, a );		// 0.5 t^3 * [ (-1*p1) + ( 3*p2) + (-3*p3) + p4 ]
	VectorScale( p2, tOne*3, b );
	VectorScale( p3, tOne*-3, c );
	VectorScale( p4, tOne, d );

	VectorAdd( a, output, output );
	VectorAdd( b, output, output );
	VectorAdd( c, output, output );
	VectorAdd( d, output, output );

	// matrix row 2
	VectorScale( p1, tTwo*2,  a );		// 0.5 t^2 * [ ( 2*p1) + (-5*p2) + ( 4*p3) - p4 ]
	VectorScale( p2, tTwo*-5, b );
	VectorScale( p3, tTwo*4,  c );
	VectorScale( p4, -tTwo,    d );

	VectorAdd( a, output, output );
	VectorAdd( b, output, output );
	VectorAdd( c, output, output );
	VectorAdd( d, output, output );

	// matrix row 3
	VectorScale( p1, -tThree, a );			// 0.5 t * [ (-1*p1) + p3 ]
	VectorScale( p3, tThree,  b );

	VectorAdd( a, output, output );
	VectorAdd( b, output, output );
}

void Catmull_Rom_Spline_Normalize(
	const Vector &p1,
	const Vector &p2,
	const Vector &p3,
	const Vector &p4,
	float t, 
	Vector& output )
{
	// Normalize p2->p1 and p3->p4 to be the same length as p2->p3
	float dt = p3.DistTo(p2);

	Vector p1n, p4n;
	VectorSubtract( p1, p2, p1n );
	VectorSubtract( p4, p3, p4n );

	VectorNormalize( p1n );
	VectorNormalize( p4n );

	VectorMA( p2, dt, p1n, p1n );
	VectorMA( p3, dt, p4n, p4n );
	
	Catmull_Rom_Spline( p1n, p2, p3, p4n, t, output );
}


void Catmull_Rom_Spline_NormalizeX(
	const Vector &p1,
	const Vector &p2,
	const Vector &p3,
	const Vector &p4,
	float t, 
	Vector& output )
{
	Vector p1n, p4n;
	Spline_Normalize( p1, p2, p3, p4, p1n, p4n );
	Catmull_Rom_Spline( p1n, p2, p3, p4n, t, output );
}


//-----------------------------------------------------------------------------
// Purpose: basic hermite spline.  t = 0 returns p1, t = 1 returns p2, 
//			d1 and d2 are used to entry and exit slope of curve
// Input  : 
//-----------------------------------------------------------------------------

void Hermite_Spline(
	const Vector &p1,
	const Vector &p2,
	const Vector &d1,
	const Vector &d2,
	float t, 
	Vector& output )
{
	Assert( s_bMathlibInitialized );
	float tSqr = t*t;
	float tCube = t*tSqr;

	Assert( &output != &p1 );
	Assert( &output != &p2 );
	Assert( &output != &d1 );
	Assert( &output != &d2 );

	float b1 = 2.0f*tCube-3.0f*tSqr+1.0f;
	float b2 = 1.0f - b1; // -2*tCube+3*tSqr;
	float b3 = tCube-2*tSqr+t;
	float b4 = tCube-tSqr;

	VectorScale( p1, b1, output );
	VectorMA( output, b2, p2, output );
	VectorMA( output, b3, d1, output );
	VectorMA( output, b4, d2, output );
}

float Hermite_Spline(
	float p1,
	float p2,
	float d1,
	float d2,
	float t )
{
	Assert( s_bMathlibInitialized );
	float output;
	float tSqr = t*t;
	float tCube = t*tSqr;

	float b1 = 2.0f*tCube-3.0f*tSqr+1.0f;
	float b2 = 1.0f - b1; // -2*tCube+3*tSqr;
	float b3 = tCube-2*tSqr+t;
	float b4 = tCube-tSqr;

	output = p1 * b1;
	output += p2 * b2;
	output += d1 * b3;
	output += d2 * b4;

	return output;
}


void Hermite_SplineBasis( float t, float basis[4] )
{
	float tSqr = t*t;
	float tCube = t*tSqr;

	basis[0] = 2.0f*tCube-3.0f*tSqr+1.0f;
	basis[1] = 1.0f - basis[0]; // -2*tCube+3*tSqr;
	basis[2] = tCube-2*tSqr+t;
	basis[3] = tCube-tSqr;
}

//-----------------------------------------------------------------------------
// Purpose: simple three data point hermite spline.  
//			t = 0 returns p1, t = 1 returns p2, 
//			slopes are generated from the p0->p1 and p1->p2 segments
//			this is reasonable C1 method when there's no "p3" data yet.
// Input  : 
//-----------------------------------------------------------------------------

// BUG: the VectorSubtract()'s calls go away if the global optimizer is enabled
#pragma optimize( "g", off )

void Hermite_Spline( const Vector &p0, const Vector &p1, const Vector &p2, float t, Vector& output )
{
	Vector e10, e21;
	VectorSubtract( p1, p0, e10 );
	VectorSubtract( p2, p1, e21 );
	Hermite_Spline( p1, p2, e10, e21, t, output );
}

#pragma optimize( "", on )

float Hermite_Spline( float p0, float p1, float p2,	float t )
{
	return Hermite_Spline( p1, p2, p1 - p0, p2 - p1, t );
}


void Hermite_Spline( const Quaternion &q0, const Quaternion &q1, const Quaternion &q2, float t, Quaternion &output )
{
	// cheap, hacked version of quaternions
	Quaternion q0a;
	Quaternion q1a;

	QuaternionAlign( q2, q0, q0a );
	QuaternionAlign( q2, q1, q1a );

	output.x = Hermite_Spline( q0a.x, q1a.x, q2.x, t );
	output.y = Hermite_Spline( q0a.y, q1a.y, q2.y, t );
	output.z = Hermite_Spline( q0a.z, q1a.z, q2.z, t );
	output.w = Hermite_Spline( q0a.w, q1a.w, q2.w, t );

	QuaternionNormalize( output );
}

// See http://en.wikipedia.org/wiki/Kochanek-Bartels_curves
// 
// Tension:  -1 = Round -> 1 = Tight
// Bias:     -1 = Pre-shoot (bias left) -> 1 = Post-shoot (bias right)
// Continuity: -1 = Box corners -> 1 = Inverted corners
//
// If T=B=C=0 it's the same matrix as Catmull-Rom.
// If T=1 & B=C=0 it's the same as Cubic.
// If T=B=0 & C=-1 it's just linear interpolation
// 
// See http://news.povray.org/povray.binaries.tutorials/attachment/%3CXns91B880592482seed7@povray.org%3E/Splines.bas.txt
// for example code and descriptions of various spline types...
// 
void Kochanek_Bartels_Spline(
	float tension, 
	float bias, 
	float continuity,
	const Vector &p1,
	const Vector &p2,
	const Vector &p3,
	const Vector &p4,
	float t, 
	Vector& output )
{
	Assert( s_bMathlibInitialized );

	float ffa, ffb, ffc, ffd;

	ffa = ( 1.0f - tension ) * ( 1.0f + continuity ) * ( 1.0f + bias );
	ffb = ( 1.0f - tension ) * ( 1.0f - continuity ) * ( 1.0f - bias );
	ffc = ( 1.0f - tension ) * ( 1.0f - continuity ) * ( 1.0f + bias );
	ffd = ( 1.0f - tension ) * ( 1.0f + continuity ) * ( 1.0f - bias );

	float tSqr = t*t*0.5f;
	float tSqrSqr = t*tSqr;
	t *= 0.5f;

	Assert( &output != &p1 );
	Assert( &output != &p2 );
	Assert( &output != &p3 );
	Assert( &output != &p4 );

	output.Init();

	Vector a, b, c, d;

	// matrix row 1
	VectorScale( p1, tSqrSqr * -ffa, a );		
	VectorScale( p2, tSqrSqr * ( 4.0f + ffa - ffb - ffc ), b );
	VectorScale( p3, tSqrSqr * ( -4.0f + ffb + ffc - ffd ), c );
	VectorScale( p4, tSqrSqr * ffd, d );

	VectorAdd( a, output, output );
	VectorAdd( b, output, output );
	VectorAdd( c, output, output );
	VectorAdd( d, output, output );

	// matrix row 2
	VectorScale( p1, tSqr* 2 * ffa,  a );		
	VectorScale( p2, tSqr * ( -6 - 2 * ffa + 2 * ffb + ffc ), b );
	VectorScale( p3, tSqr * ( 6 - 2 * ffb - ffc + ffd ),  c );
	VectorScale( p4, tSqr * -ffd,    d );

	VectorAdd( a, output, output );
	VectorAdd( b, output, output );
	VectorAdd( c, output, output );
	VectorAdd( d, output, output );

	// matrix row 3
	VectorScale( p1, t * -ffa,  a );		
	VectorScale( p2, t * ( ffa - ffb ), b );
	VectorScale( p3, t * ffb,  c );
	// p4 unchanged

	VectorAdd( a, output, output );
	VectorAdd( b, output, output );
	VectorAdd( c, output, output );

	// matrix row 4
	// p1, p3, p4 unchanged
	// p2 is multiplied by 1 and added, so just added it directly

	VectorAdd( p2, output, output );
}

void Kochanek_Bartels_Spline_NormalizeX(
	float tension, 
	float bias, 
	float continuity,
	const Vector &p1,
	const Vector &p2,
	const Vector &p3,
	const Vector &p4,
	float t, 
	Vector& output )
{
	Vector p1n, p4n;
	Spline_Normalize( p1, p2, p3, p4, p1n, p4n );
	Kochanek_Bartels_Spline( tension, bias, continuity, p1n, p2, p3, p4n, t, output );
}

void Cubic_Spline(
	const Vector &p1,
	const Vector &p2,
	const Vector &p3,
	const Vector &p4,
	float t, 
	Vector& output )
{
	Assert( s_bMathlibInitialized );

	float tSqr = t*t;
	float tSqrSqr = t*tSqr;

	Assert( &output != &p1 );
	Assert( &output != &p2 );
	Assert( &output != &p3 );
	Assert( &output != &p4 );

	output.Init();

	Vector a, b, c, d;

	// matrix row 1
	VectorScale( p2, tSqrSqr * 2, b );
	VectorScale( p3, tSqrSqr * -2, c );

	VectorAdd( b, output, output );
	VectorAdd( c, output, output );

	// matrix row 2
	VectorScale( p2, tSqr * -3, b );
	VectorScale( p3, tSqr * 3,  c );

	VectorAdd( b, output, output );
	VectorAdd( c, output, output );

	// matrix row 3
	// no influence
	// p4 unchanged

	// matrix row 4
	// p1, p3, p4 unchanged
	VectorAdd( p2, output, output );
}

void Cubic_Spline_NormalizeX(
	const Vector &p1,
	const Vector &p2,
	const Vector &p3,
	const Vector &p4,
	float t, 
	Vector& output )
{
	Vector p1n, p4n;
	Spline_Normalize( p1, p2, p3, p4, p1n, p4n );
	Cubic_Spline( p1n, p2, p3, p4n, t, output );
}

void BSpline(
	const Vector &p1,
	const Vector &p2,
	const Vector &p3,
	const Vector &p4,
	float t, 
	Vector& output )
{
	Assert( s_bMathlibInitialized );

	float oneOver6 = 1.0f / 6.0f;

	float tSqr = t * t * oneOver6;
	float tSqrSqr = t*tSqr;
	t *= oneOver6;

	Assert( &output != &p1 );
	Assert( &output != &p2 );
	Assert( &output != &p3 );
	Assert( &output != &p4 );

	output.Init();

	Vector a, b, c, d;

	// matrix row 1
	VectorScale( p1, -tSqrSqr, a );		
	VectorScale( p2, tSqrSqr * 3.0f, b );
	VectorScale( p3, tSqrSqr * -3.0f, c );
	VectorScale( p4, tSqrSqr, d );

	VectorAdd( a, output, output );
	VectorAdd( b, output, output );
	VectorAdd( c, output, output );
	VectorAdd( d, output, output );

	// matrix row 2
	VectorScale( p1, tSqr * 3.0f,  a );		
	VectorScale( p2, tSqr * -6.0f, b );
	VectorScale( p3, tSqr * 3.0f,  c );

	VectorAdd( a, output, output );
	VectorAdd( b, output, output );
	VectorAdd( c, output, output );

	// matrix row 3
	VectorScale( p1, t * -3.0f,  a );		
	VectorScale( p3, t * 3.0f,  c );
	// p4 unchanged

	VectorAdd( a, output, output );
	VectorAdd( c, output, output );

	// matrix row 4
	// p1 and p3 scaled by 1.0f, so done below
	VectorScale( p1, oneOver6, a );
	VectorScale( p2, 4.0f * oneOver6, b );
	VectorScale( p3, oneOver6, c );

	VectorAdd( a, output, output );
	VectorAdd( b, output, output );
	VectorAdd( c, output, output );
}

void BSpline_NormalizeX(
	const Vector &p1,
	const Vector &p2,
	const Vector &p3,
	const Vector &p4,
	float t, 
	Vector& output )
{
	Vector p1n, p4n;
	Spline_Normalize( p1, p2, p3, p4, p1n, p4n );
	BSpline( p1n, p2, p3, p4n, t, output );
}

void Parabolic_Spline(
	const Vector &p1,
	const Vector &p2,
	const Vector &p3,
	const Vector &p4,
	float t, 
	Vector& output )
{
	Assert( s_bMathlibInitialized );

	float tSqr = t*t*0.5f;
	t *= 0.5f;

	Assert( &output != &p1 );
	Assert( &output != &p2 );
	Assert( &output != &p3 );
	Assert( &output != &p4 );

	output.Init();

	Vector a, b, c, d;

	// matrix row 1
	// no influence from t cubed

	// matrix row 2
	VectorScale( p1, tSqr,  a );		
	VectorScale( p2, tSqr * -2.0f, b );
	VectorScale( p3, tSqr,  c );

	VectorAdd( a, output, output );
	VectorAdd( b, output, output );
	VectorAdd( c, output, output );

	// matrix row 3
	VectorScale( p1, t * -2.0f,  a );		
	VectorScale( p2, t * 2.0f,  b );
	// p4 unchanged

	VectorAdd( a, output, output );
	VectorAdd( b, output, output );

	// matrix row 4
	VectorScale( p1, 0.5f,  a );		
	VectorScale( p2, 0.5f,  b );

	VectorAdd( a, output, output );
	VectorAdd( b, output, output );
}

void Parabolic_Spline_NormalizeX(
	const Vector &p1,
	const Vector &p2,
	const Vector &p3,
	const Vector &p4,
	float t, 
	Vector& output )
{
	Vector p1n, p4n;
	Spline_Normalize( p1, p2, p3, p4, p1n, p4n );
	Parabolic_Spline( p1n, p2, p3, p4n, t, output );
}

//-----------------------------------------------------------------------------
// Purpose: Compress the input values for a ranged result such that from 75% to 200% smoothly of the range maps 
//-----------------------------------------------------------------------------

float RangeCompressor( float flValue, float flMin, float flMax, float flBase )
{
	flValue += flBase;

	// convert to 0 to 1 value
	float flMid = (flValue - flMin) / (flMax - flMin);
	// convert to -1 to 1 value
	float flTarget = flMid * 2 - 1;

	if (fabs(flTarget) > 0.75)
	{
		float t = (fabs(flTarget) - 0.75) / (1.25);
		if (t < 1.0)
		{
			if (flTarget > 0)
			{
				flTarget = Hermite_Spline( 0.75, 1, 0.75, 0, t );
			}
			else
			{
				flTarget = -Hermite_Spline( 0.75, 1, 0.75, 0, t );
			}
		}
		else
		{
			flTarget = (flTarget > 0) ? 1.0f : -1.0f;
		}
	}

	flMid = (flTarget + 1 ) / 2.0;
	flValue = flMin * (1 - flMid) + flMax * flMid;

	flValue -= flBase;

	return flValue;
}


//#pragma optimize( "", on )

//-----------------------------------------------------------------------------
// Transforms a AABB into another space; which will inherently grow the box.
//-----------------------------------------------------------------------------
void TransformAABB( const matrix3x4_t& transform, const Vector &vecMinsIn, const Vector &vecMaxsIn, Vector &vecMinsOut, Vector &vecMaxsOut )
{
	Vector localCenter;
	VectorAdd( vecMinsIn, vecMaxsIn, localCenter );
	localCenter *= 0.5f;

	Vector localExtents;
	VectorSubtract( vecMaxsIn, localCenter, localExtents );

	Vector worldCenter;
	VectorTransform( localCenter, transform, worldCenter );

	Vector worldExtents;
	worldExtents.x = DotProductAbs( localExtents, transform[0] );
	worldExtents.y = DotProductAbs( localExtents, transform[1] );
	worldExtents.z = DotProductAbs( localExtents, transform[2] );

	VectorSubtract( worldCenter, worldExtents, vecMinsOut );
	VectorAdd( worldCenter, worldExtents, vecMaxsOut );
}


//-----------------------------------------------------------------------------
// Uses the inverse transform of in1
//-----------------------------------------------------------------------------
void ITransformAABB( const matrix3x4_t& transform, const Vector &vecMinsIn, const Vector &vecMaxsIn, Vector &vecMinsOut, Vector &vecMaxsOut )
{
	Vector worldCenter;
	VectorAdd( vecMinsIn, vecMaxsIn, worldCenter );
	worldCenter *= 0.5f;

	Vector worldExtents;
	VectorSubtract( vecMaxsIn, worldCenter, worldExtents );

	Vector localCenter;
	VectorITransform( worldCenter, transform, localCenter );

	Vector localExtents;
	localExtents.x =	FloatMakePositive( worldExtents.x * transform[0][0] ) + 
						FloatMakePositive( worldExtents.y * transform[1][0] ) + 
						FloatMakePositive( worldExtents.z * transform[2][0] );
	localExtents.y =	FloatMakePositive( worldExtents.x * transform[0][1] ) + 
						FloatMakePositive( worldExtents.y * transform[1][1] ) + 
						FloatMakePositive( worldExtents.z * transform[2][1] );
	localExtents.z =	FloatMakePositive( worldExtents.x * transform[0][2] ) + 
						FloatMakePositive( worldExtents.y * transform[1][2] ) + 
						FloatMakePositive( worldExtents.z * transform[2][2] );

	VectorSubtract( localCenter, localExtents, vecMinsOut );
	VectorAdd( localCenter, localExtents, vecMaxsOut );
}


//-----------------------------------------------------------------------------
// Rotates a AABB into another space; which will inherently grow the box. 
// (same as TransformAABB, but doesn't take the translation into account)
//-----------------------------------------------------------------------------
void RotateAABB( const matrix3x4_t &transform, const Vector &vecMinsIn, const Vector &vecMaxsIn, Vector &vecMinsOut, Vector &vecMaxsOut )
{
	Vector localCenter;
	VectorAdd( vecMinsIn, vecMaxsIn, localCenter );
	localCenter *= 0.5f;

	Vector localExtents;
	VectorSubtract( vecMaxsIn, localCenter, localExtents );

	Vector newCenter;
	VectorRotate( localCenter, transform, newCenter );

	Vector newExtents;
	newExtents.x = DotProductAbs( localExtents, transform[0] );
	newExtents.y = DotProductAbs( localExtents, transform[1] );
	newExtents.z = DotProductAbs( localExtents, transform[2] );

	VectorSubtract( newCenter, newExtents, vecMinsOut );
	VectorAdd( newCenter, newExtents, vecMaxsOut );
}


//-----------------------------------------------------------------------------
// Uses the inverse transform of in1
//-----------------------------------------------------------------------------
void IRotateAABB( const matrix3x4_t &transform, const Vector &vecMinsIn, const Vector &vecMaxsIn, Vector &vecMinsOut, Vector &vecMaxsOut )
{
	Vector oldCenter;
	VectorAdd( vecMinsIn, vecMaxsIn, oldCenter );
	oldCenter *= 0.5f;

	Vector oldExtents;
	VectorSubtract( vecMaxsIn, oldCenter, oldExtents );

	Vector newCenter;
	VectorIRotate( oldCenter, transform, newCenter );

	Vector newExtents;
	newExtents.x =	FloatMakePositive( oldExtents.x * transform[0][0] ) + 
					FloatMakePositive( oldExtents.y * transform[1][0] ) + 
					FloatMakePositive( oldExtents.z * transform[2][0] );
	newExtents.y =	FloatMakePositive( oldExtents.x * transform[0][1] ) + 
					FloatMakePositive( oldExtents.y * transform[1][1] ) + 
					FloatMakePositive( oldExtents.z * transform[2][1] );
	newExtents.z =	FloatMakePositive( oldExtents.x * transform[0][2] ) + 
					FloatMakePositive( oldExtents.y * transform[1][2] ) + 
					FloatMakePositive( oldExtents.z * transform[2][2] );

	VectorSubtract( newCenter, newExtents, vecMinsOut );
	VectorAdd( newCenter, newExtents, vecMaxsOut );
}


float CalcSqrDistanceToAABB( const Vector &mins, const Vector &maxs, const Vector &point )
{
	float flDelta;
	float flDistSqr = 0.0f;

	if ( point.x < mins.x )
	{
		flDelta = (mins.x - point.x);
		flDistSqr += flDelta * flDelta;
	}
	else if ( point.x > maxs.x )
	{
		flDelta = (point.x - maxs.x);
		flDistSqr += flDelta * flDelta;
	}

	if ( point.y < mins.y )
	{
		flDelta = (mins.y - point.y);
		flDistSqr += flDelta * flDelta;
	}
	else if ( point.y > maxs.y )
	{
		flDelta = (point.y - maxs.y);
		flDistSqr += flDelta * flDelta;
	}

	if ( point.z < mins.z )
	{
		flDelta = (mins.z - point.z);
		flDistSqr += flDelta * flDelta;
	}
	else if ( point.z > maxs.z )
	{
		flDelta = (point.z - maxs.z);
		flDistSqr += flDelta * flDelta;
	}

	return flDistSqr;
}


void CalcClosestPointOnAABB( const Vector &mins, const Vector &maxs, const Vector &point, Vector &closestOut )
{
	for ( int i = 0; i < 3; i++ )
	{
		if ( point[i] < mins[i] )
		{
			closestOut[i] = mins[i];
		}
		else if ( point[i] > maxs[i] )
		{
			closestOut[i] = maxs[i];
		}
		else
		{
			closestOut[i] = point[i];
		}
	}
}

float CalcClosestPointToLineT( const Vector &P, const Vector &vLineA, const Vector &vLineB, Vector &vDir )
{
	Assert( s_bMathlibInitialized );
	VectorSubtract( vLineB, vLineA, vDir );

	// D dot [P - (A + D*t)] = 0
	// t = ( DP - DA) / DD
	float div = vDir.Dot( vDir );
	if( div < 0.00001f )
	{
		return 0;
	}
	else
	{
		return (vDir.Dot( P ) - vDir.Dot( vLineA )) / div;
	}
}

void CalcClosestPointOnLine( const Vector &P, const Vector &vLineA, const Vector &vLineB, Vector &vClosest, float *outT )
{
	Assert( s_bMathlibInitialized );
	Vector vDir;
	float t = CalcClosestPointToLineT( P, vLineA, vLineB, vDir );
	if ( outT ) *outT = t;
	vClosest.MulAdd( vLineA, vDir, t );
}


float CalcDistanceToLine( const Vector &P, const Vector &vLineA, const Vector &vLineB, float *outT )
{
	Assert( s_bMathlibInitialized );
	Vector vClosest;
	CalcClosestPointOnLine( P, vLineA, vLineB, vClosest, outT );
	return P.DistTo(vClosest);
}

float CalcDistanceSqrToLine( const Vector &P, const Vector &vLineA, const Vector &vLineB, float *outT )
{
	Assert( s_bMathlibInitialized );
	Vector vClosest;
	CalcClosestPointOnLine( P, vLineA, vLineB, vClosest, outT );
	return P.DistToSqr(vClosest);
}

void CalcClosestPointOnLineSegment( const Vector &P, const Vector &vLineA, const Vector &vLineB, Vector &vClosest, float *outT )
{
	Vector vDir;
	float t = CalcClosestPointToLineT( P, vLineA, vLineB, vDir );
	t = clamp( t, 0, 1 );
	if ( outT ) 
	{
		*outT = t;
	}
	vClosest.MulAdd( vLineA, vDir, t );
}


float CalcDistanceToLineSegment( const Vector &P, const Vector &vLineA, const Vector &vLineB, float *outT )
{
	Assert( s_bMathlibInitialized );
	Vector vClosest;
	CalcClosestPointOnLineSegment( P, vLineA, vLineB, vClosest, outT );
	return P.DistTo( vClosest );
}

float CalcDistanceSqrToLineSegment( const Vector &P, const Vector &vLineA, const Vector &vLineB, float *outT )
{
	Assert( s_bMathlibInitialized );
	Vector vClosest;
	CalcClosestPointOnLineSegment( P, vLineA, vLineB, vClosest, outT );
	return P.DistToSqr(vClosest);
}

float CalcClosestPointToLineT2D( const Vector2D &P, const Vector2D &vLineA, const Vector2D &vLineB, Vector2D &vDir )
{
	Assert( s_bMathlibInitialized );
	Vector2DSubtract( vLineB, vLineA, vDir );

	// D dot [P - (A + D*t)] = 0
	// t = (DP - DA) / DD
	float div = vDir.Dot( vDir );
	if( div < 0.00001f )
	{
		return 0;
	}
	else
	{
		return (vDir.Dot( P ) - vDir.Dot( vLineA )) / div;
	}
}

void CalcClosestPointOnLine2D( const Vector2D &P, const Vector2D &vLineA, const Vector2D &vLineB, Vector2D &vClosest, float *outT )
{
	Assert( s_bMathlibInitialized );
	Vector2D vDir;
	float t = CalcClosestPointToLineT2D( P, vLineA, vLineB, vDir );
	if ( outT ) *outT = t;
	vClosest.MulAdd( vLineA, vDir, t );
}

float CalcDistanceToLine2D( const Vector2D &P, const Vector2D &vLineA, const Vector2D &vLineB, float *outT )
{
	Assert( s_bMathlibInitialized );
	Vector2D vClosest;
	CalcClosestPointOnLine2D( P, vLineA, vLineB, vClosest, outT );
	return P.DistTo( vClosest );
}

float CalcDistanceSqrToLine2D( const Vector2D &P, const Vector2D &vLineA, const Vector2D &vLineB, float *outT )
{
	Assert( s_bMathlibInitialized );
	Vector2D vClosest;
	CalcClosestPointOnLine2D( P, vLineA, vLineB, vClosest, outT );
	return P.DistToSqr(vClosest);
}

void CalcClosestPointOnLineSegment2D( const Vector2D &P, const Vector2D &vLineA, const Vector2D &vLineB, Vector2D &vClosest, float *outT )
{
	Vector2D vDir;
	float t = CalcClosestPointToLineT2D( P, vLineA, vLineB, vDir );
	t = clamp( t, 0, 1 );
	if ( outT )
	{
		*outT = t;
	}
	vClosest.MulAdd( vLineA, vDir, t );
}

float CalcDistanceToLineSegment2D( const Vector2D &P, const Vector2D &vLineA, const Vector2D &vLineB, float *outT )
{
	Assert( s_bMathlibInitialized );
	Vector2D vClosest;
	CalcClosestPointOnLineSegment2D( P, vLineA, vLineB, vClosest, outT );
	return P.DistTo( vClosest );
}

float CalcDistanceSqrToLineSegment2D( const Vector2D &P, const Vector2D &vLineA, const Vector2D &vLineB, float *outT )
{
	Assert( s_bMathlibInitialized );
	Vector2D vClosest;
	CalcClosestPointOnLineSegment2D( P, vLineA, vLineB, vClosest, outT );
	return P.DistToSqr( vClosest );
}

// Do we have another epsilon we could use
#define LINE_EPS ( 0.000001f )

//-----------------------------------------------------------------------------
// Purpose: Given lines p1->p2 and p3->p4, computes a line segment (pa->pb) and returns the parameters 0->1 multipliers
//  along each segment for the returned points
// Input  : p1 - 
//			p2 - 
//			p3 - 
//			p4 - 
//			*s1 - 
//			*s2 - 
// Output : Returns true on success, false on failure.
//-----------------------------------------------------------------------------
bool CalcLineToLineIntersectionSegment(
   const Vector& p1,const Vector& p2,const Vector& p3,const Vector& p4,Vector *s1,Vector *s2,
   float *t1, float *t2)
{
   Vector p13,p43,p21;
   float d1343,d4321,d1321,d4343,d2121;
   float numer,denom;

   p13.x = p1.x - p3.x;
   p13.y = p1.y - p3.y;
   p13.z = p1.z - p3.z;
   p43.x = p4.x - p3.x;
   p43.y = p4.y - p3.y;
   p43.z = p4.z - p3.z;

   if (fabs(p43.x)  < LINE_EPS && fabs(p43.y)  < LINE_EPS && fabs(p43.z)  < LINE_EPS)
      return false;
   p21.x = p2.x - p1.x;
   p21.y = p2.y - p1.y;
   p21.z = p2.z - p1.z;
   if (fabs(p21.x)  < LINE_EPS && fabs(p21.y)  < LINE_EPS && fabs(p21.z)  < LINE_EPS)
      return false;

   d1343 = p13.x * p43.x + p13.y * p43.y + p13.z * p43.z;
   d4321 = p43.x * p21.x + p43.y * p21.y + p43.z * p21.z;
   d1321 = p13.x * p21.x + p13.y * p21.y + p13.z * p21.z;
   d4343 = p43.x * p43.x + p43.y * p43.y + p43.z * p43.z;
   d2121 = p21.x * p21.x + p21.y * p21.y + p21.z * p21.z;

   denom = d2121 * d4343 - d4321 * d4321;
   if (fabs(denom) < LINE_EPS)
      return false;
   numer = d1343 * d4321 - d1321 * d4343;

   *t1 = numer / denom;
   *t2 = (d1343 + d4321 * (*t1)) / d4343;

   s1->x = p1.x + *t1 * p21.x;
   s1->y = p1.y + *t1 * p21.y;
   s1->z = p1.z + *t1 * p21.z;
   s2->x = p3.x + *t2 * p43.x;
   s2->y = p3.y + *t2 * p43.y;
   s2->z = p3.z + *t2 * p43.z;

   return true;
}

#pragma optimize( "", off )

// stuff from windows.h
#ifndef _XBOX
typedef unsigned int DWORD;
#endif

#ifndef EXCEPTION_EXECUTE_HANDLER
#define EXCEPTION_EXECUTE_HANDLER       1
#endif


#pragma optimize( "", on )

static bool s_b3DNowEnabled = false;
static bool s_bMMXEnabled = false;
static bool s_bSSEEnabled = false;
static bool s_bSSE2Enabled = false;

void MathLib_Init( float gamma, float texGamma, float brightness, int overbright, bool bAllow3DNow, bool bAllowSSE, bool bAllowSSE2, bool bAllowMMX )
{
#ifdef _XBOX
	if (s_bMathlibInitialized)
		return;
#endif
	// FIXME: Hook SSE into VectorAligned + Vector4DAligned

	// Grab the processor information:
	const CPUInformation& pi = GetCPUInformation();

#ifndef _XBOX
	// Select the default generic routines.
	pfSqrt = _sqrtf;
	pfRSqrt = _rsqrtf;
	pfRSqrtFast = _rsqrtf;
	pfVectorNormalize = _VectorNormalize;
	pfVectorNormalizeFast = _VectorNormalizeFast;
	pfInvRSquared = _InvRSquared;
	pfFastSinCos = SinCos;
	pfFastCos = cosf;
#endif

	if ( bAllowMMX && pi.m_bMMX )
	{
		// Select the MMX specific routines if available
		// (MMX routines were used by SW span fillers - not currently used for HW)
		s_bMMXEnabled = true;
	}
	else
	{
		s_bMMXEnabled = false;
	}

#ifndef _XBOX
	// SSE Generally performs better than 3DNow when present, so this is placed 
	// first to allow SSE to override these settings.
	if ( bAllow3DNow && pi.m_b3DNow )
	{
		s_b3DNowEnabled = true;

		// Select the 3DNow specific routines if available;
		pfVectorNormalize = _3DNow_VectorNormalize;
		pfVectorNormalizeFast = _3DNow_VectorNormalizeFast;
		pfInvRSquared = _3DNow_InvRSquared;
		pfSqrt = _3DNow_Sqrt;
		pfRSqrt = _3DNow_RSqrt;
		pfRSqrtFast = _3DNow_RSqrt;
	}
	else
	{
		s_b3DNowEnabled = false;
	}
#endif

	if ( bAllowSSE && pi.m_bSSE )
	{
		s_bSSEEnabled = true;
#ifdef _XBOX
		_MM_SET_FLUSH_ZERO_MODE( _MM_FLUSH_ZERO_ON );
#endif

		// Select the SSE specific routines if available
		pfVectorNormalize = _VectorNormalize;
		pfVectorNormalizeFast = _SSE_VectorNormalizeFast;
		pfInvRSquared = _SSE_InvRSquared;
		pfSqrt = _SSE_Sqrt;
		pfRSqrt = _SSE_RSqrtAccurate;
		pfRSqrtFast = _SSE_RSqrtFast;
#ifdef _WIN32
		pfFastSinCos = _SSE_SinCos;
		pfFastCos = _SSE_cos;
#endif
	}
	else
	{
#ifdef _XBOX
		// sse expected
		Assert(0);
#endif
		s_bSSEEnabled = false;
	}

#ifndef _XBOX
	if ( bAllowSSE2 && pi.m_bSSE2 )
	{
		s_bSSE2Enabled = true;
#ifdef _WIN32
		pfFastSinCos = _SSE2_SinCos;
		pfFastCos = _SSE2_cos;
#endif
	} else
	{
		s_bSSE2Enabled = false;
	}
#endif

	s_bMathlibInitialized = true;

	InitSinCosTable();
	BuildGammaTable( gamma, texGamma, brightness, overbright );
}

bool MathLib_3DNowEnabled( void )
{
	Assert( s_bMathlibInitialized );
	return s_b3DNowEnabled;
}

bool MathLib_MMXEnabled( void )
{
	Assert( s_bMathlibInitialized );
	return s_bMMXEnabled;
}

bool MathLib_SSEEnabled( void )
{
	Assert( s_bMathlibInitialized );
	return s_bSSEEnabled;
}

bool MathLib_SSE2Enabled( void )
{
	Assert( s_bMathlibInitialized );
	return s_bSSE2Enabled;
}

float Approach( float target, float value, float speed )
{
	float delta = target - value;

	if ( delta > speed )
		value += speed;
	else if ( delta < -speed )
		value -= speed;
	else 
		value = target;

	return value;
}

// BUGBUG: Why doesn't this call angle diff?!?!?
float ApproachAngle( float target, float value, float speed )
{
	target = anglemod( target );
	value = anglemod( value );
	
	float delta = target - value;

	// Speed is assumed to be positive
	if ( speed < 0 )
		speed = -speed;

	if ( delta < -180 )
		delta += 360;
	else if ( delta > 180 )
		delta -= 360;

	if ( delta > speed )
		value += speed;
	else if ( delta < -speed )
		value -= speed;
	else 
		value = target;

	return value;
}


// BUGBUG: Why do we need both of these?
float AngleDiff( float destAngle, float srcAngle )
{
	float delta;

	delta = fmodf(destAngle - srcAngle, 360.0f);
	if ( destAngle > srcAngle )
	{
		if ( delta >= 180 )
			delta -= 360;
	}
	else
	{
		if ( delta <= -180 )
			delta += 360;
	}
	return delta;
}


float AngleDistance( float next, float cur )
{
	float delta = next - cur;

	if ( delta < -180 )
		delta += 360;
	else if ( delta > 180 )
		delta -= 360;

	return delta;
}


float AngleNormalize( float angle )
{
	angle = fmodf(angle, 360.0f);
	if (angle > 180) 
	{
		angle -= 360;
	}
	if (angle < -180)
	{
		angle += 360;
	}
	return angle;
}

//--------------------------------------------------------------------------------------------------------------
// ensure that 0 <= angle <= 360
float AngleNormalizePositive( float angle )
{
	angle = fmodf( angle, 360.0f );

	if (angle < 0.0f)
	{
		angle += 360.0f;
	}

	return angle;
}

//--------------------------------------------------------------------------------------------------------------
bool AnglesAreEqual( float a, float b, float tolerance )
{
	return (fabs( AngleDiff( a, b ) ) < tolerance);
}

void RotationDeltaAxisAngle( const QAngle &srcAngles, const QAngle &destAngles, Vector &deltaAxis, float &deltaAngle )
{
	Quaternion srcQuat, destQuat, srcQuatInv, out;
	AngleQuaternion( srcAngles, srcQuat );
	AngleQuaternion( destAngles, destQuat );
	QuaternionScale( srcQuat, -1, srcQuatInv );
	QuaternionMult( destQuat, srcQuatInv, out );

	QuaternionNormalize( out );
	QuaternionAxisAngle( out, deltaAxis, deltaAngle );
}

void RotationDelta( const QAngle &srcAngles, const QAngle &destAngles, QAngle *out )
{
	matrix3x4_t src, srcInv;
	matrix3x4_t dest;
	AngleMatrix( srcAngles, src );
	AngleMatrix( destAngles, dest );
	// xform = src(-1) * dest
	MatrixInvert( src, srcInv );
	matrix3x4_t xform;
	ConcatTransforms( dest, srcInv, xform );
	QAngle xformAngles;
	MatrixAngles( xform, xformAngles );
	if ( out )
	{
		*out = xformAngles;
	}
}

//-----------------------------------------------------------------------------
// Purpose: Computes a triangle normal
//-----------------------------------------------------------------------------
void ComputeTrianglePlane( const Vector& v1, const Vector& v2, const Vector& v3, Vector& normal, float& intercept )
{
	Vector e1, e2;
	VectorSubtract( v2, v1, e1 );
	VectorSubtract( v3, v1, e2 );
	CrossProduct( e1, e2, normal );
	VectorNormalize( normal );
	intercept = DotProduct( normal, v1 ); 
}

//-----------------------------------------------------------------------------
// Purpose: This is a clone of BaseWindingForPlane()
// Input  : *outVerts - an array of preallocated verts to build the polygon in
//			normal - the plane normal
//			dist - the plane constant
// Output : int - vert count (always 4)
//-----------------------------------------------------------------------------
int PolyFromPlane( Vector *outVerts, const Vector& normal, float dist, float fHalfScale )
{
	int		i, x;
	vec_t	max, v;
	Vector	org, vright, vup;

	// find the major axis

	max = -16384; //MAX_COORD_INTEGER
	x = -1;
	for (i=0 ; i<3; i++)
	{
		v = fabs(normal[i]);
		if (v > max)
		{
			x = i;
			max = v;
		}
	}

	if (x==-1)
		return 0;

	// Build a unit vector along something other than the major axis
	VectorCopy (vec3_origin, vup);	
	switch (x)
	{
	case 0:
	case 1:
		vup[2] = 1;
		break;		
	case 2:
		vup[0] = 1;
		break;		
	}

	// Remove the component of this vector along the normal
	v = DotProduct (vup, normal);
	VectorMA (vup, -v, normal, vup);
	// Make it a unit (perpendicular)
	VectorNormalize (vup);

	// Center of the poly is at normal * dist
	VectorScale (normal, dist, org);
	// Calculate the third orthonormal basis vector for our plane space (this one and vup are in the plane)
	CrossProduct (vup, normal, vright);

	// Make the plane's basis vectors big (these are the half-sides of the polygon we're making)
	VectorScale (vup, fHalfScale, vup);
	VectorScale (vright, fHalfScale, vright);

	// Move diagonally away from org to create the corner verts
	VectorSubtract (org, vright, outVerts[0]);	// left
	VectorAdd (outVerts[0], vup, outVerts[0]);	// up

	VectorAdd (org, vright, outVerts[1]);		// right
	VectorAdd (outVerts[1], vup, outVerts[1]);	// up

	VectorAdd (org, vright, outVerts[2]);		// right
	VectorSubtract (outVerts[2], vup, outVerts[2]);	// down

	VectorSubtract (org, vright, outVerts[3]);		// left
	VectorSubtract (outVerts[3], vup, outVerts[3]);	// down

	// The four corners form a planar quadrilateral normal to "normal"
	return 4;
}

//-----------------------------------------------------------------------------
// Purpose: clip a poly to the plane and return the poly on the front side of the plane
// Input  : *inVerts - input polygon
//			vertCount - # verts in input poly
//			*outVerts - destination poly
//			normal - plane normal
//			dist - plane constant
// Output : int - # verts in output poly
//-----------------------------------------------------------------------------

int ClipPolyToPlane( Vector *inVerts, int vertCount, Vector *outVerts, const Vector& normal, float dist, float fOnPlaneEpsilon )
{
	vec_t	*dists = (vec_t *)stackalloc( sizeof(vec_t) * vertCount * 4 ); //4x vertcount should cover all cases
	int		*sides = (int *)stackalloc( sizeof(vec_t) * vertCount * 4 );
	int		counts[3];
	vec_t	dot;
	int		i, j;
	Vector	mid = vec3_origin;
	int		outCount;

	counts[0] = counts[1] = counts[2] = 0;

	// determine sides for each point
	for ( i = 0; i < vertCount; i++ )
	{
		dot = DotProduct( inVerts[i], normal) - dist;
		dists[i] = dot;
		if ( dot > fOnPlaneEpsilon )
		{
			sides[i] = SIDE_FRONT;
		}
		else if ( dot < -fOnPlaneEpsilon )
		{
			sides[i] = SIDE_BACK;
		}
		else
		{
			sides[i] = SIDE_ON;
		}
		counts[sides[i]]++;
	}
	sides[i] = sides[0];
	dists[i] = dists[0];

	if (!counts[0])
		return 0;

	if (!counts[1])
	{
		// Copy to output verts
		for ( i = 0; i < vertCount; i++ )
		{
			VectorCopy( inVerts[i], outVerts[i] );
		}
		return vertCount;
	}

	outCount = 0;
	for ( i = 0; i < vertCount; i++ )
	{
		Vector& p1 = inVerts[i];

		if (sides[i] == SIDE_ON)
		{
			VectorCopy( p1, outVerts[outCount]);
			outCount++;
			continue;
		}

		if (sides[i] == SIDE_FRONT)
		{
			VectorCopy( p1, outVerts[outCount]);
			outCount++;
		}

		if (sides[i+1] == SIDE_ON || sides[i+1] == sides[i])
			continue;

		// generate a split point
		Vector& p2 = inVerts[(i+1)%vertCount];

		dot = dists[i] / (dists[i]-dists[i+1]);
		for (j=0 ; j<3 ; j++)
		{	// avoid round off error when possible
			if (normal[j] == 1)
				mid[j] = dist;
			else if (normal[j] == -1)
				mid[j] = -dist;
			else
				mid[j] = p1[j] + dot*(p2[j]-p1[j]);
		}

		VectorCopy (mid, outVerts[outCount]);
		outCount++;
	}

	return outCount;
}


int ClipPolyToPlane_Precise( double *inVerts, int vertCount, double *outVerts, const double *normal, double dist, double fOnPlaneEpsilon )
{
	double	*dists = (double *)stackalloc( sizeof(double) * vertCount * 4 ); //4x vertcount should cover all cases
	int		*sides = (int *)stackalloc( sizeof(double) * vertCount * 4 );
	int		counts[3];
	double	dot;
	int		i, j;
	//Vector	mid = vec3_origin;
	double mid[3];
	mid[0] = 0.0;
	mid[1] = 0.0;
	mid[2] = 0.0;
	int		outCount;

	counts[0] = counts[1] = counts[2] = 0;

	// determine sides for each point
	for ( i = 0; i < vertCount; i++ )
	{
		//dot = DotProduct( inVerts[i], normal) - dist;
		dot = ((inVerts[i*3 + 0] * normal[0]) + (inVerts[i*3 + 1] * normal[1]) + (inVerts[i*3 + 2] * normal[2])) - dist;
		dists[i] = dot;
		if ( dot > fOnPlaneEpsilon )
		{
			sides[i] = SIDE_FRONT;
		}
		else if ( dot < -fOnPlaneEpsilon )
		{
			sides[i] = SIDE_BACK;
		}
		else
		{
			sides[i] = SIDE_ON;
		}
		counts[sides[i]]++;
	}
	sides[i] = sides[0];
	dists[i] = dists[0];

	if (!counts[0])
		return 0;

	if (!counts[1])
	{
		// Copy to output verts
		//for ( i = 0; i < vertCount; i++ )
		for ( i = 0; i < vertCount * 3; i++ )
		{
			//VectorCopy( inVerts[i], outVerts[i] );
			outVerts[i] = inVerts[i];
		}
		return vertCount;
	}

	outCount = 0;
	for ( i = 0; i < vertCount; i++ )
	{
		//Vector& p1 = inVerts[i];
		double *p1 = &inVerts[i*3];
		//p1[0] = inVerts[i*3 + 0];
		//p1[1] = inVerts[i*3 + 1];
		//p1[2] = inVerts[i*3 + 2];

		if (sides[i] == SIDE_ON)
		{
			//VectorCopy( p1, outVerts[outCount]);
			outVerts[outCount*3 + 0] = p1[0];
			outVerts[outCount*3 + 1] = p1[1];
			outVerts[outCount*3 + 2] = p1[2];
			outCount++;
			continue;
		}

		if (sides[i] == SIDE_FRONT)
		{
			//VectorCopy( p1, outVerts[outCount]);
			outVerts[outCount*3 + 0] = p1[0];
			outVerts[outCount*3 + 1] = p1[1];
			outVerts[outCount*3 + 2] = p1[2];
			outCount++;
		}

		if (sides[i+1] == SIDE_ON || sides[i+1] == sides[i])
			continue;

		// generate a split point
		//Vector& p2 = inVerts[(i+1)%vertCount];
		int wrappedindex = (i+1)%vertCount;
		double *p2 = &inVerts[wrappedindex*3];
		//p2[0] = inVerts[wrappedindex*3 + 0];
		//p2[1] = inVerts[wrappedindex*3 + 1];
		//p2[2] = inVerts[wrappedindex*3 + 2];

		dot = dists[i] / (dists[i]-dists[i+1]);
		for (j=0 ; j<3 ; j++)
		{
			mid[j] = (double)p1[j] + dot*((double)p2[j]-(double)p1[j]);
		}

		//VectorCopy (mid, outVerts[outCount]);
		outVerts[outCount*3 + 0] = mid[0];
		outVerts[outCount*3 + 1] = mid[1];
		outVerts[outCount*3 + 2] = mid[2];
		outCount++;
	}

	return outCount;
}

void ColorRGBExp32ToVector( const ColorRGBExp32& in, Vector& out )
{
	Assert( s_bMathlibInitialized );
	// FIXME: Why is there a factor of 255 built into this?
	out.x = 255.0f * TexLightToLinear( in.r, in.exponent );
	out.y = 255.0f * TexLightToLinear( in.g, in.exponent );
	out.z = 255.0f * TexLightToLinear( in.b, in.exponent );
}

// assumes that the desired mantissa range is 0..255
static int VectorToColorRGBExp32_CalcExponent( float in )
{
	int power = 0;
	
	if( in != 0.0f )
	{
		while( in > 255.0f )
		{
			power += 1;
			in *= 0.5f;
		}
		
		while( in < 127.0f )
		{
			power -= 1;
			in *= 2.0f;
		}
	}

	return power;
}

void VectorToColorRGBExp32( const Vector& vin, ColorRGBExp32 &c )
{
	Vector v = vin;
	Assert( s_bMathlibInitialized );
	Assert( v.x >= 0.0f && v.y >= 0.0f && v.z >= 0.0f );
	int i;		
	float max = v[0];				
	for( i = 1; i < 3; i++ )
	{
		// Get the maximum value.
		if( v[i] > max )
		{
			max = v[i];
		}
	}
				
	// figure out the exponent for this luxel.
	int exponent = VectorToColorRGBExp32_CalcExponent( max );
				
	// make the exponent fits into a signed byte.
	if( exponent < -128 )
	{
		exponent = -128;
	}
	else if( exponent > 127 )
	{
		exponent = 127;
	}
				
	// undone: optimize with a table
	float scalar = pow( 2.0f, -exponent );
	// convert to mantissa x 2^exponent format
	for( i = 0; i < 3; i++ )
	{
		v[i] *= scalar;
		// clamp
		if( v[i] > 255.0f )
		{
			v[i] = 255.0f;
		}
	}
	c.r = ( unsigned char )v[0];
	c.g = ( unsigned char )v[1];
	c.b = ( unsigned char )v[2];
	c.exponent = ( signed char )exponent;
}

int CeilPow2( int in )
{
	int retval;
	
	retval = 1;
	while( retval < in )
		retval <<= 1;
	return retval;
}

int FloorPow2( int in )
{
	int retval;
	
	retval = 1;
	while( retval < in )
		retval <<= 1;
	return retval >> 1;
}

bool R_CullBox( const Vector& mins, const Vector& maxs, const Frustum_t &frustum )
{
	return (( BoxOnPlaneSide( mins, maxs, frustum.GetPlane(FRUSTUM_RIGHT) ) == 2 ) || 
			( BoxOnPlaneSide( mins, maxs, frustum.GetPlane(FRUSTUM_LEFT) ) == 2 ) ||
			( BoxOnPlaneSide( mins, maxs, frustum.GetPlane(FRUSTUM_TOP) ) == 2 ) ||
			( BoxOnPlaneSide( mins, maxs, frustum.GetPlane(FRUSTUM_BOTTOM) ) == 2 ) ||
			( BoxOnPlaneSide( mins, maxs, frustum.GetPlane(FRUSTUM_NEARZ) ) == 2 ) ||
			( BoxOnPlaneSide( mins, maxs, frustum.GetPlane(FRUSTUM_FARZ) ) == 2 ) );
}

bool R_CullBoxSkipNear( const Vector& mins, const Vector& maxs, const Frustum_t &frustum )
{
	return (( BoxOnPlaneSide( mins, maxs, frustum.GetPlane(FRUSTUM_RIGHT) ) == 2 ) || 
			( BoxOnPlaneSide( mins, maxs, frustum.GetPlane(FRUSTUM_LEFT) ) == 2 ) ||
			( BoxOnPlaneSide( mins, maxs, frustum.GetPlane(FRUSTUM_TOP) ) == 2 ) ||
			( BoxOnPlaneSide( mins, maxs, frustum.GetPlane(FRUSTUM_BOTTOM) ) == 2 ) ||
			( BoxOnPlaneSide( mins, maxs, frustum.GetPlane(FRUSTUM_FARZ) ) == 2 ) );
}

#endif // !_STATIC_LINKED || _SHARED_LIB
#	Change	User	Description	Committed
#1	5821	Knut Wikstrom	Added Valve Source code. This is NOT to be commited to other than new code from Valve.