/* 
    Implementation of SIMD functionality for Intel SSE with VC++ compiler
    added by T.Grill
*/

#include "m_pd.h"
#include "m_simd.h"

#if defined(NT) && defined(_MSC_VER)

/*
 * Fill a signal vector with zeros using SSE stores.
 *
 *   w[1]  out  destination buffer; written with movaps, so it has to be
 *              16-byte aligned
 *   w[2]  n    number of samples
 *
 * The vector is processed in chunks of 16 floats (n >> 4 iterations); a
 * remainder of n modulo 16 is left untouched, and for n < 16 nothing is
 * written at all.
 *
 * Returns w+3.
 */
t_int *zero_perf_sse_vc(t_int *w)
{
	__asm {
		mov		esi,dword ptr [w]

		mov		edx,dword ptr [esi + 1*TYPE t_int] /* out */
		/* load zero */
		xorps	xmm0,xmm0

		mov		ecx,[esi + 2*TYPE t_int] /* n */
		shr		ecx,4 /* number of 16-float chunks; ZF is set if that is 0 */
		/* "loop" decrements ecx before testing it, so entering the loop with
		   ecx == 0 would wrap around to 2^32 iterations - skip it instead */
		jz		done

		/* should we do more loop unrolling? */
loopa:
		/* four 4-float stores = 16 samples per iteration */
		movaps	xmmword ptr[edx],xmm0
		movaps	xmmword ptr[edx+4*TYPE t_float],xmm0
		movaps	xmmword ptr[edx+8*TYPE t_float],xmm0
		movaps	xmmword ptr[edx+12*TYPE t_float],xmm0

		add		edx,16*TYPE t_float
		/* very short loop - let's assume that branch prediction does its job nicely */
		loop	loopa
done:
	}
    return (w+3);
}


/*
 * Copy one signal vector to another using SSE loads/stores.
 *
 *   w[1]  in   source buffer
 *   w[2]  out  destination buffer
 *   w[3]  n    number of samples
 *
 * Both buffers are accessed with movaps and therefore have to be 16-byte
 * aligned.  The vector is processed in chunks of 16 floats (n >> 4
 * iterations); a remainder of n modulo 16 is not copied, and for n < 16
 * nothing is copied at all.
 *
 * Returns w+4.
 */
t_int *copy_perf_sse_vc(t_int *w)
{
	__asm {
		mov		esi,dword ptr [w]

		mov		ebx,dword ptr [esi + 1*TYPE t_int] /* in1 */
/*		prefetcht0 [ebx] */
		mov		edx,dword ptr [esi + 2*TYPE t_int] /* out */

		mov		ecx,dword ptr [esi + 3*TYPE t_int] /* n */
		shr		ecx,4 /* number of 16-float chunks; ZF is set if that is 0 */
		/* "loop" decrements ecx before testing it, so entering the loop with
		   ecx == 0 would wrap around to 2^32 iterations - skip it instead */
		jz		done

/*		prefetcht0 [ebx+8*TYPE t_float] */

loopa:
/*		prefetcht0 [ebx+12*TYPE t_float] */

		/* each load/store pair uses its own register */
		movaps	xmm0,xmmword ptr[ebx]
		movaps	xmmword ptr[edx],xmm0
		movaps	xmm1,xmmword ptr[ebx+4*TYPE t_float]
		movaps	xmmword ptr[edx+4*TYPE t_float],xmm1

/*		prefetcht0 [ebx+16*TYPE t_float] */

		movaps	xmm2,xmmword ptr[ebx+8*TYPE t_float]
		movaps	xmmword ptr[edx+8*TYPE t_float],xmm2
		movaps	xmm3,xmmword ptr[ebx+12*TYPE t_float]
		movaps	xmmword ptr[edx+12*TYPE t_float],xmm3

		add		ebx,16*TYPE t_float
		add		edx,16*TYPE t_float
		loop	loopa
done:
	}
    return (w+4);
}


/*
 * Fill a signal vector with a constant value using SSE stores.
 *
 *   w[1]  f    address of the float value to write
 *   w[2]  out  destination buffer; written with movaps, so it has to be
 *              16-byte aligned
 *   w[3]  n    number of samples
 *
 * The value is read once, before the loop.  The vector is processed in
 * chunks of 16 floats (n >> 4 iterations); a remainder of n modulo 16 is
 * left untouched, and for n < 16 nothing is written at all.
 *
 * Returns w+4.
 */
t_int *sig_tilde_perf_sse_vc(t_int *w)
{
	__asm {
		mov		esi,dword ptr [w]

		mov		edx,dword ptr [esi + 2*TYPE t_int] /* out */
		/* load value ... this is not very clean.. */
		mov		eax,dword ptr [esi + 1*TYPE t_int] /* f */
		movss	xmm0,xmmword ptr [eax]
		shufps	xmm0,xmm0,0 /* replicate lane 0 into all four lanes */

		mov		ecx,dword ptr [esi + 3*TYPE t_int] /* n */
		shr		ecx,4 /* number of 16-float chunks; ZF is set if that is 0 */
		/* "loop" decrements ecx before testing it, so entering the loop with
		   ecx == 0 would wrap around to 2^32 iterations - skip it instead */
		jz		done

loopa:
		movaps	xmmword ptr[edx],xmm0
		movaps	xmmword ptr[edx+4*TYPE t_float],xmm0
		movaps	xmmword ptr[edx+8*TYPE t_float],xmm0
		movaps	xmmword ptr[edx+12*TYPE t_float],xmm0

		add		edx,16*TYPE t_float
		loop	loopa
done:
	}
    return (w+4);
}


/*
 * Element-wise sum of two signal vectors: out[i] = in1[i] + in2[i].
 *
 *   w[1]  in1  first input buffer
 *   w[2]  in2  second input buffer
 *   w[3]  out  destination buffer
 *   w[4]  n    number of samples
 *
 * All buffers are accessed with movaps and therefore have to be 16-byte
 * aligned.  The vectors are processed in chunks of 16 floats (n >> 4
 * iterations); a remainder of n modulo 16 is not processed, and for n < 16
 * nothing is written at all.
 *
 * Returns w+5.
 */
t_int *plus_perf_sse_vc(t_int *w)
{
	__asm {
		mov		esi,dword ptr [w]

		mov		eax,dword ptr [esi + 1*TYPE t_int] /* in1 */
/*      prefetcht0 [eax] prefetch first cache line */
		mov		ebx,dword ptr [esi + 2*TYPE t_int] /* in2 */
/*      prefetcht0 [ebx] prefetch first cache line */
		mov		edx,dword ptr [esi + 3*TYPE t_int] /* out */
		mov		ecx,dword ptr [esi + 4*TYPE t_int] /* n */
		shr		ecx,4 /* divide by 16; ZF is set if the result is 0 */
		/* "loop" decrements ecx before testing it, so entering the loop with
		   ecx == 0 would wrap around to 2^32 iterations - skip it instead.
		   This has to come before the xor below, which overwrites ZF. */
		jz		done

		/* all arguments are loaded - esi now becomes the common byte offset */
		xor		esi,esi /* reset index */
/*
		prefetcht0 [eax+8*TYPE t_float]
		prefetcht0 [ebx+8*TYPE t_float]
*/
loopa:
/*
        prefetcht0 [eax+16*TYPE t_float]
		prefetcht0 [ebx+16*TYPE t_float]
*/
		movaps	xmm0,xmmword ptr[eax+esi]
		movaps	xmm1,xmmword ptr[ebx+esi]
		addps	xmm0,xmm1
		movaps	xmmword ptr[edx+esi],xmm0

		movaps	xmm2,xmmword ptr[eax+esi+4*TYPE t_float]
		movaps	xmm3,xmmword ptr[ebx+esi+4*TYPE t_float]
		addps	xmm2,xmm3
		movaps	xmmword ptr[edx+esi+4*TYPE t_float],xmm2
/*
        prefetcht0 [eax+24*TYPE t_float]
		prefetcht0 [ebx+24*TYPE t_float]
*/
		movaps	xmm4,xmmword ptr[eax+esi+8*TYPE t_float]
		movaps	xmm5,xmmword ptr[ebx+esi+8*TYPE t_float]
		addps	xmm4,xmm5
		movaps	xmmword ptr[edx+esi+8*TYPE t_float],xmm4

		movaps	xmm6,xmmword ptr[eax+esi+12*TYPE t_float]
		movaps	xmm7,xmmword ptr[ebx+esi+12*TYPE t_float]
		addps	xmm6,xmm7
		movaps	xmmword ptr[edx+esi+12*TYPE t_float],xmm6

		add		esi,16*TYPE t_float
		loop	loopa
done:
	}
    return (w+5);
}

/*
 * Add a scalar to every sample of a signal vector: out[i] = in[i] + value.
 *
 *   w[1]  in     input buffer
 *   w[2]  value  address of the float to add (read once, before the loop)
 *   w[3]  out    destination buffer
 *   w[4]  n      number of samples
 *
 * in and out are accessed with movaps and therefore have to be 16-byte
 * aligned.  The vector is processed in chunks of 16 floats (n >> 4
 * iterations); a remainder of n modulo 16 is not processed, and for n < 16
 * nothing is written at all.
 *
 * Returns w+5.
 */
t_int *scalarplus_perf_sse_vc(t_int *w)
{
	__asm {
		mov		esi,dword ptr [w]

		mov		ebx,dword ptr [esi + 1*TYPE t_int] /* in */
/*      prefetcht0 [ebx] prefetch first cache line */

		mov		edx,dword ptr [esi + 3*TYPE t_int] /* out */

		/* load value ... this is not very clean.. */
		mov		eax,dword ptr [esi + 2*TYPE t_int] /* value */
		movss	xmm0,xmmword ptr [eax]
		shufps	xmm0,xmm0,0 /* replicate lane 0 into all four lanes */

		mov		ecx,dword ptr [esi + 4*TYPE t_int] /* n */
		shr		ecx,4 /* number of 16-float chunks; ZF is set if that is 0 */
		/* "loop" decrements ecx before testing it, so entering the loop with
		   ecx == 0 would wrap around to 2^32 iterations - skip it instead */
		jz		done

/*		prefetcht0 [ebx+8*TYPE t_float] */

loopa:
/*		prefetcht0 [ebx+16*TYPE t_float] */

		movaps	xmm1,xmmword ptr[ebx]
		addps	xmm1,xmm0
		movaps	xmmword ptr[edx],xmm1

		movaps	xmm2,xmmword ptr[ebx+4*TYPE t_float]
		addps	xmm2,xmm0
		movaps	xmmword ptr[edx+4*TYPE t_float],xmm2

/*		prefetcht0 [ebx+24*TYPE t_float] */

		movaps	xmm3,xmmword ptr[ebx+8*TYPE t_float]
		addps	xmm3,xmm0
		movaps	xmmword ptr[edx+8*TYPE t_float],xmm3

		movaps	xmm4,xmmword ptr[ebx+12*TYPE t_float]
		addps	xmm4,xmm0
		movaps	xmmword ptr[edx+12*TYPE t_float],xmm4

		add		ebx,16*TYPE t_float
		add		edx,16*TYPE t_float
		loop	loopa
done:
	}
    return (w+5);
}

/*
 * Element-wise difference of two signal vectors: out[i] = in1[i] - in2[i].
 *
 *   w[1]  in1  minuend buffer
 *   w[2]  in2  subtrahend buffer
 *   w[3]  out  destination buffer
 *   w[4]  n    number of samples
 *
 * All buffers are accessed with movaps and therefore have to be 16-byte
 * aligned.  The vectors are processed in chunks of 16 floats (n >> 4
 * iterations); a remainder of n modulo 16 is not processed, and for n < 16
 * nothing is written at all.
 *
 * Returns w+5.
 */
t_int *minus_perf_sse_vc(t_int *w)
{
	__asm {
		mov		esi,dword ptr [w]

		mov		eax,dword ptr [esi + 1*TYPE t_int] /* in1 */
/*      prefetcht0 [eax]    prefetch first cache line */
		mov		ebx,dword ptr [esi + 2*TYPE t_int] /* in2 */
/*      prefetcht0 [ebx]    prefetch first cache line */
		mov		edx,dword ptr [esi + 3*TYPE t_int] /* out */
		mov		ecx,dword ptr [esi + 4*TYPE t_int] /* n */
		shr		ecx,4 /* number of 16-float chunks; ZF is set if that is 0 */
		/* "loop" decrements ecx before testing it, so entering the loop with
		   ecx == 0 would wrap around to 2^32 iterations - skip it instead.
		   This has to come before the xor below, which overwrites ZF. */
		jz		done

		/* all arguments are loaded - esi now becomes the common byte offset */
		xor		esi,esi /* reset index */
/*
		prefetcht0 [eax+8*TYPE t_float]
		prefetcht0 [ebx+8*TYPE t_float]
*/
loopa:
/*
        prefetcht0 [eax+16*TYPE t_float]
		prefetcht0 [ebx+16*TYPE t_float]
*/

		movaps	xmm0,xmmword ptr[eax+esi]
		movaps	xmm1,xmmword ptr[ebx+esi]
		subps	xmm0,xmm1
		movaps	xmmword ptr[edx+esi],xmm0

		movaps	xmm2,xmmword ptr[eax+esi+4*TYPE t_float]
		movaps	xmm3,xmmword ptr[ebx+esi+4*TYPE t_float]
		subps	xmm2,xmm3
		movaps	xmmword ptr[edx+esi+4*TYPE t_float],xmm2

/*
        prefetcht0 [eax+24*TYPE t_float]
		prefetcht0 [ebx+24*TYPE t_float]
*/

		movaps	xmm4,xmmword ptr[eax+esi+8*TYPE t_float]
		movaps	xmm5,xmmword ptr[ebx+esi+8*TYPE t_float]
		subps	xmm4,xmm5
		movaps	xmmword ptr[edx+esi+8*TYPE t_float],xmm4

		movaps	xmm6,xmmword ptr[eax+esi+12*TYPE t_float]
		movaps	xmm7,xmmword ptr[ebx+esi+12*TYPE t_float]
		subps	xmm6,xmm7
		movaps	xmmword ptr[edx+esi+12*TYPE t_float],xmm6

		add		esi,16*TYPE t_float
		loop	loopa
done:
	}
    return (w+5);
}

/*
 * Subtract a scalar from every sample of a signal vector:
 * out[i] = in[i] - g.
 *
 *   w[1]  in   input buffer
 *   w[2]  g    address of the float to subtract (read once, before the loop)
 *   w[3]  out  destination buffer
 *   w[4]  n    number of samples
 *
 * in and out are accessed with movaps and therefore have to be 16-byte
 * aligned.  The vector is processed in chunks of 16 floats (n >> 4
 * iterations); a remainder of n modulo 16 is not processed, and for n < 16
 * nothing is written at all.
 *
 * Returns w+5.
 */
t_int *scalarminus_perf_sse_vc(t_int *w)
{
	__asm {
		mov		esi,dword ptr [w]

		mov		ebx,dword ptr [esi + 1*TYPE t_int] /* in */
/*      prefetcht0 [ebx]    prefetch first cache line */
		mov		edx,dword ptr [esi + 3*TYPE t_int] /* out */

		/* load value ... this is not very clean.. */
		mov		eax,dword ptr [esi + 2*TYPE t_int] /* g */
		movss	xmm0,xmmword ptr [eax]
		shufps	xmm0,xmm0,0 /* replicate lane 0 into all four lanes */

		mov		ecx,dword ptr [esi + 4*TYPE t_int] /* n */
		shr		ecx,4 /* number of 16-float chunks; ZF is set if that is 0 */
		/* "loop" decrements ecx before testing it, so entering the loop with
		   ecx == 0 would wrap around to 2^32 iterations - skip it instead */
		jz		done

/*		prefetcht0 [ebx+8*TYPE t_float] */

loopa:
/*		prefetcht0 [ebx+16*TYPE t_float] */

		movaps	xmm1,xmmword ptr[ebx]
		subps	xmm1,xmm0
		movaps	xmmword ptr[edx],xmm1

		movaps	xmm2,xmmword ptr[ebx+4*TYPE t_float]
		subps	xmm2,xmm0
		movaps	xmmword ptr[edx+4*TYPE t_float],xmm2

/*		prefetcht0 [ebx+24*TYPE t_float] */

		movaps	xmm3,xmmword ptr[ebx+8*TYPE t_float]
		subps	xmm3,xmm0
		movaps	xmmword ptr[edx+8*TYPE t_float],xmm3

		movaps	xmm4,xmmword ptr[ebx+12*TYPE t_float]
		subps	xmm4,xmm0
		movaps	xmmword ptr[edx+12*TYPE t_float],xmm4

		add		ebx,16*TYPE t_float
		add		edx,16*TYPE t_float
		loop	loopa
done:
	}
    return (w+5);
}

/*
 * Element-wise product of two signal vectors: out[i] = in1[i] * in2[i].
 *
 *   w[1]  in1  first input buffer
 *   w[2]  in2  second input buffer
 *   w[3]  out  destination buffer
 *   w[4]  n    number of samples
 *
 * All buffers are accessed with movaps and therefore have to be 16-byte
 * aligned.  The vectors are processed in chunks of 16 floats (n >> 4
 * iterations); a remainder of n modulo 16 is not processed, and for n < 16
 * nothing is written at all.
 *
 * Returns w+5.
 */
t_int *times_perf_sse_vc(t_int *w)
{
	__asm {
		mov		esi,dword ptr [w]

		mov		eax,dword ptr [esi + 1*TYPE t_int] /* in1 */
/*      prefetcht0 [eax]    prefetch first cache line */
		mov		ebx,dword ptr [esi + 2*TYPE t_int] /* in2 */
/*      prefetcht0 [ebx]    prefetch first cache line */
		mov		edx,dword ptr [esi + 3*TYPE t_int] /* out */

		mov		ecx,dword ptr [esi + 4*TYPE t_int] /* n */
		shr		ecx,4 /* number of 16-float chunks; ZF is set if that is 0 */
		/* "loop" decrements ecx before testing it, so entering the loop with
		   ecx == 0 would wrap around to 2^32 iterations - skip it instead.
		   This has to come before the xor below, which overwrites ZF. */
		jz		done

		/* all arguments are loaded - esi now becomes the common byte offset */
		xor		esi,esi /* reset index */
/*
		prefetcht0 [eax+8*TYPE t_float]
		prefetcht0 [ebx+8*TYPE t_float]
*/
loopa:
/*
        prefetcht0 [eax+16*TYPE t_float]
		prefetcht0 [ebx+16*TYPE t_float]
*/

		movaps	xmm0,xmmword ptr[eax+esi]
		movaps	xmm1,xmmword ptr[ebx+esi]
		mulps	xmm0,xmm1
		movaps	xmmword ptr[edx+esi],xmm0

		movaps	xmm2,xmmword ptr[eax+esi+4*TYPE t_float]
		movaps	xmm3,xmmword ptr[ebx+esi+4*TYPE t_float]
		mulps	xmm2,xmm3
		movaps	xmmword ptr[edx+esi+4*TYPE t_float],xmm2

/*
		prefetcht0 [eax+24*TYPE t_float]
		prefetcht0 [ebx+24*TYPE t_float]
*/

		movaps	xmm4,xmmword ptr[eax+esi+8*TYPE t_float]
		movaps	xmm5,xmmword ptr[ebx+esi+8*TYPE t_float]
		mulps	xmm4,xmm5
		movaps	xmmword ptr[edx+esi+8*TYPE t_float],xmm4

		movaps	xmm6,xmmword ptr[eax+esi+12*TYPE t_float]
		movaps	xmm7,xmmword ptr[ebx+esi+12*TYPE t_float]
		mulps	xmm6,xmm7
		movaps	xmmword ptr[edx+esi+12*TYPE t_float],xmm6

		add		esi,16*TYPE t_float
		loop	loopa
done:
	}
    return (w+5);
}

/*
 * Multiply every sample of a signal vector by a scalar:
 * out[i] = in[i] * g.
 *
 *   w[1]  in   input buffer
 *   w[2]  g    address of the float factor (read once, before the loop)
 *   w[3]  out  destination buffer
 *   w[4]  n    number of samples
 *
 * in and out are accessed with movaps and therefore have to be 16-byte
 * aligned.  The vector is processed in chunks of 16 floats (n >> 4
 * iterations); a remainder of n modulo 16 is not processed, and for n < 16
 * nothing is written at all.
 *
 * Returns w+5.
 */
t_int *scalartimes_perf_sse_vc(t_int *w)
{
	__asm {
		mov		esi,dword ptr [w]

		mov		ebx,dword ptr [esi + 1*TYPE t_int] /* in */
/*      prefetcht0 [ebx]    prefetch first cache line */
		mov		edx,dword ptr [esi + 3*TYPE t_int] /* out */

		/* load value ... this is not very clean.. */
		mov		eax,dword ptr [esi + 2*TYPE t_int] /* g */
		movss	xmm0,xmmword ptr [eax]
		shufps	xmm0,xmm0,0 /* replicate lane 0 into all four lanes */

		mov		ecx,dword ptr [esi + 4*TYPE t_int] /* n */
		shr		ecx,4 /* number of 16-float chunks; ZF is set if that is 0 */
		/* "loop" decrements ecx before testing it, so entering the loop with
		   ecx == 0 would wrap around to 2^32 iterations - skip it instead */
		jz		done

/*		prefetcht0 [ebx+8*TYPE t_float] */

loopa:
/*		prefetcht0 [ebx+16*TYPE t_float] */

		movaps	xmm1,xmmword ptr[ebx]
		mulps	xmm1,xmm0
		movaps	xmmword ptr[edx],xmm1

		movaps	xmm2,xmmword ptr[ebx+4*TYPE t_float]
		mulps	xmm2,xmm0
		movaps	xmmword ptr[edx+4*TYPE t_float],xmm2

/*		prefetcht0 [ebx+24*TYPE t_float] */

		movaps	xmm3,xmmword ptr[ebx+8*TYPE t_float]
		mulps	xmm3,xmm0
		movaps	xmmword ptr[edx+8*TYPE t_float],xmm3

		movaps	xmm4,xmmword ptr[ebx+12*TYPE t_float]
		mulps	xmm4,xmm0
		movaps	xmmword ptr[edx+12*TYPE t_float],xmm4

		add		ebx,16*TYPE t_float
		add		edx,16*TYPE t_float
		loop	loopa
done:
	}
    return (w+5);
}

/*
 * Element-wise quotient of two signal vectors: out[i] = in1[i] / in2[i].
 *
 *   w[1]  in1  dividend buffer
 *   w[2]  in2  divisor buffer
 *   w[3]  out  destination buffer
 *   w[4]  n    number of samples
 *
 * There is no checking for zero divisors yet: each lane simply receives
 * whatever divps produces for it.
 *
 * All buffers are accessed with movaps and therefore have to be 16-byte
 * aligned.  The vectors are processed in chunks of 16 floats (n >> 4
 * iterations); a remainder of n modulo 16 is not processed, and for n < 16
 * nothing is written at all.
 *
 * Returns w+5.
 */
t_int *over_perf_sse_vc(t_int *w)
{
	__asm {
		mov		esi,dword ptr [w]

		mov		eax,dword ptr [esi + 1*TYPE t_int] /* in1 */
/*      prefetcht0 [eax]    prefetch first cache line */
		mov		ebx,dword ptr [esi + 2*TYPE t_int] /* in2 */
/*      prefetcht0 [ebx]    prefetch first cache line */
		mov		edx,dword ptr [esi + 3*TYPE t_int] /* out */

		mov		ecx,dword ptr [esi + 4*TYPE t_int] /* n */
		shr		ecx,4 /* number of 16-float chunks; ZF is set if that is 0 */
		/* "loop" decrements ecx before testing it, so entering the loop with
		   ecx == 0 would wrap around to 2^32 iterations - skip it instead.
		   This has to come before the xor below, which overwrites ZF. */
		jz		done

		/* all arguments are loaded - esi now becomes the common byte offset */
		xor		esi,esi /* reset index */
/*
		prefetcht0 [eax+8*TYPE t_float]
		prefetcht0 [ebx+8*TYPE t_float]
*/
loopa:
/*
        prefetcht0 [eax+16*TYPE t_float]
		prefetcht0 [ebx+16*TYPE t_float]
*/

		movaps	xmm0,xmmword ptr[eax+esi]
		movaps	xmm1,xmmword ptr[ebx+esi]
		divps	xmm0,xmm1
		movaps	xmmword ptr[edx+esi],xmm0

		movaps	xmm2,xmmword ptr[eax+esi+4*TYPE t_float]
		movaps	xmm3,xmmword ptr[ebx+esi+4*TYPE t_float]
		divps	xmm2,xmm3
		movaps	xmmword ptr[edx+esi+4*TYPE t_float],xmm2

/*
		prefetcht0 [eax+24*TYPE t_float]
		prefetcht0 [ebx+24*TYPE t_float]
*/

		movaps	xmm4,xmmword ptr[eax+esi+8*TYPE t_float]
		movaps	xmm5,xmmword ptr[ebx+esi+8*TYPE t_float]
		divps	xmm4,xmm5
		movaps	xmmword ptr[edx+esi+8*TYPE t_float],xmm4

		movaps	xmm6,xmmword ptr[eax+esi+12*TYPE t_float]
		movaps	xmm7,xmmword ptr[ebx+esi+12*TYPE t_float]
		divps	xmm6,xmm7
		movaps	xmmword ptr[edx+esi+12*TYPE t_float],xmm6

		add		esi,16*TYPE t_float
		loop	loopa
done:
	}
    return (w+5);
}

/*
 * Divide every sample of a signal vector by a scalar.
 *
 *   w[1]  in   input buffer
 *   w[2]  g    address of the float divisor (read once, before the loop)
 *   w[3]  out  destination buffer
 *   w[4]  n    number of samples
 *
 * The division is carried out as a multiplication: 1/g is computed once
 * with divss and every sample is multiplied by it.  If g compares equal to
 * zero the factor stays 0, so the output is in[i] * 0 instead of a division
 * by zero.
 *
 * in and out are accessed with movaps and therefore have to be 16-byte
 * aligned.  The vector is processed in chunks of 16 floats (n >> 4
 * iterations); a remainder of n modulo 16 is not processed, and for n < 16
 * nothing is written at all.
 *
 * Returns w+5.
 */
t_int *scalarover_perf_sse_vc(t_int *w)
{
    static const float one = 1.f;

	__asm {
		mov		esi,dword ptr [w]

		mov		ebx,dword ptr [esi + 1*TYPE t_int] /* in */
/*      prefetcht0 [ebx]    prefetch first cache line */
		mov		edx,dword ptr [esi + 3*TYPE t_int] /* out */

		/* load value */
		mov		eax,dword ptr [esi + 2*TYPE t_int] /* g */
		movss	xmm1,xmmword ptr [eax]

		mov		ecx,dword ptr [esi + 4*TYPE t_int] /* n */
		shr		ecx,4 /* number of 16-float chunks; ZF is set if that is 0 */
		/* "loop" decrements ecx before testing it, so entering the loop with
		   ecx == 0 would wrap around to 2^32 iterations - skip it instead.
		   This has to come before comiss below, which overwrites ZF. */
		jz		done

/*		prefetcht0 [ebx+8*TYPE t_float] */

		/* check for zero */
		xorps	xmm0,xmm0
		comiss	xmm1,xmm0  /* compare g (low lane of xmm1) to 0 */
		/* if g is zero, xmm0 is still all zeros and serves as the factor
		   -> goto loopa */
		/* NOTE(review): comiss presumably also sets ZF for an unordered
		   compare, so a NaN divisor would take this branch as well and give
		   in[i] * 0 - confirm that this is intended */
		jz		loopa

		/* else, compute the reciprocal of g in xmm0 */
/*        rcpps   xmm0,xmm0  ... far too imprecise!! */

		movss	xmm0,[one]
		divss	xmm0,xmm1  /* xmm0 = 1 / g */
		shufps	xmm0,xmm0,0 /* make all lanes of xmm0 the same */

loopa:
/*		prefetcht0 [ebx+16*TYPE t_float] */

		movaps	xmm1,xmmword ptr[ebx]
		mulps	xmm1,xmm0
		movaps	xmmword ptr[edx],xmm1

		movaps	xmm2,xmmword ptr[ebx+4*TYPE t_float]
		mulps	xmm2,xmm0
		movaps	xmmword ptr[edx+4*TYPE t_float],xmm2

/*		prefetcht0 [ebx+24*TYPE t_float] */

		movaps	xmm3,xmmword ptr[ebx+8*TYPE t_float]
		mulps	xmm3,xmm0
		movaps	xmmword ptr[edx+8*TYPE t_float],xmm3

		movaps	xmm4,xmmword ptr[ebx+12*TYPE t_float]
		mulps	xmm4,xmm0
		movaps	xmmword ptr[edx+12*TYPE t_float],xmm4

		add		ebx,16*TYPE t_float
		add		edx,16*TYPE t_float
		loop	loopa
done:
	}
    return (w+5);
}

/*
 * Element-wise maximum of two signal vectors, computed with maxps.
 *
 *   w[1]  in1  first input buffer
 *   w[2]  in2  second input buffer
 *   w[3]  out  destination buffer
 *   w[4]  n    number of samples
 *
 * All buffers are accessed with movaps and therefore have to be 16-byte
 * aligned.  The vectors are processed in chunks of 16 floats (n >> 4
 * iterations); a remainder of n modulo 16 is not processed, and for n < 16
 * nothing is written at all.
 *
 * Returns w+5.
 */
t_int *max_perf_sse_vc(t_int *w)
{
	__asm {
		mov		esi,dword ptr [w]

		mov		eax,dword ptr [esi + 1*TYPE t_int] /* in1 */
/*		prefetcht0 [eax] */
		mov		ebx,dword ptr [esi + 2*TYPE t_int] /* in2 */
/*		prefetcht0 [ebx] */
		mov		edx,dword ptr [esi + 3*TYPE t_int] /* out */

		mov		ecx,dword ptr [esi + 4*TYPE t_int] /* n */
		shr		ecx,4 /* number of 16-float chunks; ZF is set if that is 0 */
		/* "loop" decrements ecx before testing it, so entering the loop with
		   ecx == 0 would wrap around to 2^32 iterations - skip it instead.
		   This has to come before the xor below, which overwrites ZF. */
		jz		done

		/* all arguments are loaded - esi now becomes the common byte offset */
		xor		esi,esi /* reset index */
/*
        prefetcht0 [eax+8*TYPE t_float]
		prefetcht0 [ebx+8*TYPE t_float]
*/
loopa:
/*
        prefetcht0 [eax+16*TYPE t_float]
		prefetcht0 [ebx+16*TYPE t_float]
*/
		movaps	xmm0,xmmword ptr[eax+esi]
		movaps	xmm1,xmmword ptr[ebx+esi]
		maxps	xmm0,xmm1
		movaps	xmmword ptr[edx+esi],xmm0

		movaps	xmm2,xmmword ptr[eax+esi+4*TYPE t_float]
		movaps	xmm3,xmmword ptr[ebx+esi+4*TYPE t_float]
		maxps	xmm2,xmm3
		movaps	xmmword ptr[edx+esi+4*TYPE t_float],xmm2

/*
        prefetcht0 [eax+24*TYPE t_float]
		prefetcht0 [ebx+24*TYPE t_float]
*/

		movaps	xmm4,xmmword ptr[eax+esi+8*TYPE t_float]
		movaps	xmm5,xmmword ptr[ebx+esi+8*TYPE t_float]
		maxps	xmm4,xmm5
		movaps	xmmword ptr[edx+esi+8*TYPE t_float],xmm4

		movaps	xmm6,xmmword ptr[eax+esi+12*TYPE t_float]
		movaps	xmm7,xmmword ptr[ebx+esi+12*TYPE t_float]
		maxps	xmm6,xmm7
		movaps	xmmword ptr[edx+esi+12*TYPE t_float],xmm6

		add		esi,16*TYPE t_float
		loop	loopa
done:
	}
    return (w+5);
}

/*
 * Maximum of every sample of a signal vector and a scalar, computed with
 * maxps.
 *
 *   w[1]  in   input buffer
 *   w[2]  g    address of the float to compare against (read once, before
 *              the loop)
 *   w[3]  out  destination buffer
 *   w[4]  n    number of samples
 *
 * in and out are accessed with movaps and therefore have to be 16-byte
 * aligned.  The vector is processed in chunks of 16 floats (n >> 4
 * iterations); a remainder of n modulo 16 is not processed, and for n < 16
 * nothing is written at all.
 *
 * Returns w+5.
 */
t_int *scalarmax_perf_sse_vc(t_int *w)
{
	__asm {
		mov		esi,dword ptr [w]

		mov		ebx,dword ptr [esi + 1*TYPE t_int] /* in */
/*		prefetcht0 [ebx] */
		mov		edx,dword ptr [esi + 3*TYPE t_int] /* out */

		/* load value ... this is not very clean.. */
		mov		eax,dword ptr [esi + 2*TYPE t_int] /* g */
		movss	xmm0,xmmword ptr [eax]
		shufps	xmm0,xmm0,0 /* replicate lane 0 into all four lanes */

		mov		ecx,dword ptr [esi + 4*TYPE t_int] /* n */
		shr		ecx,4 /* number of 16-float chunks; ZF is set if that is 0 */
		/* "loop" decrements ecx before testing it, so entering the loop with
		   ecx == 0 would wrap around to 2^32 iterations - skip it instead */
		jz		done

/*		prefetcht0 [ebx+8*TYPE t_float] */

loopa:
/*		prefetcht0 [ebx+16*TYPE t_float] */

		movaps	xmm1,xmmword ptr[ebx]
		maxps	xmm1,xmm0
		movaps	xmmword ptr[edx],xmm1

		movaps	xmm2,xmmword ptr[ebx+4*TYPE t_float]
		maxps	xmm2,xmm0
		movaps	xmmword ptr[edx+4*TYPE t_float],xmm2

/*		prefetcht0 [ebx+24*TYPE t_float] */

		movaps	xmm3,xmmword ptr[ebx+8*TYPE t_float]
		maxps	xmm3,xmm0
		movaps	xmmword ptr[edx+8*TYPE t_float],xmm3

		movaps	xmm4,xmmword ptr[ebx+12*TYPE t_float]
		maxps	xmm4,xmm0
		movaps	xmmword ptr[edx+12*TYPE t_float],xmm4

		add		ebx,16*TYPE t_float
		add		edx,16*TYPE t_float
		loop	loopa
done:
	}
    return (w+5);
}

/*
 * Element-wise minimum of two signal vectors, computed with minps.
 *
 *   w[1]  in1  first input buffer
 *   w[2]  in2  second input buffer
 *   w[3]  out  destination buffer
 *   w[4]  n    number of samples
 *
 * All buffers are accessed with movaps and therefore have to be 16-byte
 * aligned.  The vectors are processed in chunks of 16 floats (n >> 4
 * iterations); a remainder of n modulo 16 is not processed, and for n < 16
 * nothing is written at all.
 *
 * Returns w+5.
 */
t_int *min_perf_sse_vc(t_int *w)
{
	__asm {
		mov		esi,dword ptr [w]

		mov		eax,dword ptr [esi + 1*TYPE t_int] /* in1 */
/*		prefetcht0 [eax] */
		mov		ebx,dword ptr [esi + 2*TYPE t_int] /* in2 */
/*		prefetcht0 [ebx] */
		mov		edx,dword ptr [esi + 3*TYPE t_int] /* out */

		mov		ecx,dword ptr [esi + 4*TYPE t_int] /* n */
		shr		ecx,4 /* number of 16-float chunks; ZF is set if that is 0 */
		/* "loop" decrements ecx before testing it, so entering the loop with
		   ecx == 0 would wrap around to 2^32 iterations - skip it instead.
		   This has to come before the xor below, which overwrites ZF. */
		jz		done

		/* all arguments are loaded - esi now becomes the common byte offset */
		xor		esi,esi /* reset index */
/*
        prefetcht0 [eax+8*TYPE t_float]
		prefetcht0 [ebx+8*TYPE t_float]
*/
loopa:
/*
        prefetcht0 [eax+16*TYPE t_float]
		prefetcht0 [ebx+16*TYPE t_float]
*/
		movaps	xmm0,xmmword ptr[eax+esi]
		movaps	xmm1,xmmword ptr[ebx+esi]
		minps	xmm0,xmm1
		movaps	xmmword ptr[edx+esi],xmm0

		movaps	xmm2,xmmword ptr[eax+esi+4*TYPE t_float]
		movaps	xmm3,xmmword ptr[ebx+esi+4*TYPE t_float]
		minps	xmm2,xmm3
		movaps	xmmword ptr[edx+esi+4*TYPE t_float],xmm2
/*
        prefetcht0 [eax+24*TYPE t_float]
		prefetcht0 [ebx+24*TYPE t_float]
*/
		movaps	xmm4,xmmword ptr[eax+esi+8*TYPE t_float]
		movaps	xmm5,xmmword ptr[ebx+esi+8*TYPE t_float]
		minps	xmm4,xmm5
		movaps	xmmword ptr[edx+esi+8*TYPE t_float],xmm4

		movaps	xmm6,xmmword ptr[eax+esi+12*TYPE t_float]
		movaps	xmm7,xmmword ptr[ebx+esi+12*TYPE t_float]
		minps	xmm6,xmm7
		movaps	xmmword ptr[edx+esi+12*TYPE t_float],xmm6

		add		esi,16*TYPE t_float
		loop	loopa
done:
	}
    return (w+5);
}

/*
 * Minimum of every sample of a signal vector and a scalar, computed with
 * minps.
 *
 *   w[1]  in   input buffer
 *   w[2]  g    address of the float to compare against (read once, before
 *              the loop)
 *   w[3]  out  destination buffer
 *   w[4]  n    number of samples
 *
 * in and out are accessed with movaps and therefore have to be 16-byte
 * aligned.  The vector is processed in chunks of 16 floats (n >> 4
 * iterations); a remainder of n modulo 16 is not processed, and for n < 16
 * nothing is written at all.
 *
 * Returns w+5.
 */
t_int *scalarmin_perf_sse_vc(t_int *w)
{
	__asm {
		mov		esi,dword ptr [w]

		mov		ebx,dword ptr [esi + 1*TYPE t_int] /* in */
/*		prefetcht0 [ebx] */
		mov		edx,dword ptr [esi + 3*TYPE t_int] /* out */

		/* load value ... this is not very clean.. */
		mov		eax,dword ptr [esi + 2*TYPE t_int] /* g */
		movss	xmm0,xmmword ptr [eax]
		shufps	xmm0,xmm0,0 /* replicate lane 0 into all four lanes */

		mov		ecx,dword ptr [esi + 4*TYPE t_int] /* n */
		shr		ecx,4 /* number of 16-float chunks; ZF is set if that is 0 */
		/* "loop" decrements ecx before testing it, so entering the loop with
		   ecx == 0 would wrap around to 2^32 iterations - skip it instead */
		jz		done

/*		prefetcht0 [ebx+8*TYPE t_float] */

loopa:
/*		prefetcht0 [ebx+16*TYPE t_float] */

		movaps	xmm1,xmmword ptr[ebx]
		minps	xmm1,xmm0
		movaps	xmmword ptr[edx],xmm1

		movaps	xmm2,xmmword ptr[ebx+4*TYPE t_float]
		minps	xmm2,xmm0
		movaps	xmmword ptr[edx+4*TYPE t_float],xmm2

/*		prefetcht0 [ebx+24*TYPE t_float] */

		movaps	xmm3,xmmword ptr[ebx+8*TYPE t_float]
		minps	xmm3,xmm0
		movaps	xmmword ptr[edx+8*TYPE t_float],xmm3

		movaps	xmm4,xmmword ptr[ebx+12*TYPE t_float]
		minps	xmm4,xmm0
		movaps	xmmword ptr[edx+12*TYPE t_float],xmm4

		add		ebx,16*TYPE t_float
		add		edx,16*TYPE t_float
		loop	loopa
done:
	}
    return (w+5);
}

#endif
