/*******************************************************************************
*	fische - standalone sound visualisation for linux		       *
*	Copyright (C) 2006 Marcel Ebmer					       *
*									       *
*	This program is free software; you can redistribute it and/or	       *
*	modify it under the terms of the GNU General Public License	       *
*	as published by the Free Software Foundation; either version 2	       *
*	of the License, or (at your option) any later version.		       *
*									       *
*	This program is distributed in the hope that it will be useful,	       *
*	but WITHOUT ANY WARRANTY; without even the implied warranty of	       *
*	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the	       *
*	GNU General Public License for more details.			       *
*									       *
*	You should have received a copy of the GNU General Public License      *
*	along with this program; if not, write to the Free Software	       *
*	Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, 	       *
*	MA  02110-1301, USA.						       *
*******************************************************************************/
#include "fische.h"

void blur ( int vectortype ) {

#ifndef MMX
	
	register char REG__vX, REG__vY;
	signed register char *REG__vbytes;
	int x, y;
	
	unsigned register char *REG__pixels = sdl_screen->pixels;
	const register int REG__sp_2 = 2 * surfacepitch;
	const register int REG__sp_1 = surfacepitch;
	const register int REG__xres = XRes;
	const register int REG__yres = YRes;
	unsigned register char *REG__bd = blur_dest;

	unsigned register char *REG__dest_bytes;
	unsigned register char *REG__source_bytes;
	
	unsigned register char REG__newbyte;

	bzero ( REG__bd, REG__yres * REG__sp_1 );

	REG__vbytes = vectors + vectortype * vector_fieldsize;

	for ( x = 2; x < REG__xres - 2; x++ ){
		for ( y = 2; y < REG__yres - 2; y++ ){
			
			REG__vX = *( REG__vbytes + y * REG__xres * 2 + x * 2 + 0 );
			REG__vY = *( REG__vbytes + y * REG__xres * 2 + x * 2 + 1 );
			
			REG__dest_bytes = REG__bd + y * REG__sp_1 + x * 4;
			asm ( "prefetchw %0" : : "m" (*REG__dest_bytes) );
			REG__source_bytes = REG__pixels + ( y + REG__vY ) * REG__sp_1 + ( x + REG__vX ) * 4;
			asm ( "prefetch %0" : : "m" (*REG__source_bytes) );
			
			REG__newbyte = 0;
			REG__newbyte += ( *REG__source_bytes * 51 ) >> 8;
			REG__newbyte += ( *( REG__source_bytes - 8 ) * 51 ) >> 8;
			REG__newbyte += ( *( REG__source_bytes + 8 ) * 51 ) >> 8;
			REG__newbyte += ( *( REG__source_bytes + REG__sp_2 ) * 51 ) >> 8;
			REG__newbyte += ( *( REG__source_bytes - REG__sp_2 ) * 51 ) >> 8;
			*REG__dest_bytes = REG__newbyte;

			REG__newbyte = 0;
			REG__dest_bytes++;
			REG__newbyte += ( *( REG__source_bytes - 7 ) * 51 ) >> 8;
			REG__newbyte += ( *( REG__source_bytes + 9 ) * 51 ) >> 8;
			REG__newbyte += ( *( REG__source_bytes + 1 ) * 51 ) >> 8;
			REG__newbyte += ( *( REG__source_bytes + REG__sp_2 + 1 ) * 51 ) >> 8;
			REG__newbyte += ( *( REG__source_bytes - REG__sp_2 + 1 ) * 51 ) >> 8;
			*REG__dest_bytes = REG__newbyte;

			REG__newbyte = 0;
			REG__dest_bytes++;
			REG__newbyte += ( *( REG__source_bytes + 2 ) * 51 ) >> 8;
			REG__newbyte += ( *( REG__source_bytes - 6 ) * 51 ) >> 8;
			REG__newbyte += ( *( REG__source_bytes + 10 ) * 51 ) >> 8;
			REG__newbyte += ( *( REG__source_bytes + REG__sp_2 + 2 ) * 51 ) >> 8;
			REG__newbyte += ( *( REG__source_bytes - REG__sp_2 + 2 ) * 51 ) >> 8;
			*REG__dest_bytes = REG__newbyte;
		}
	}
	memcpy ( REG__pixels, REG__bd, REG__yres * REG__sp_1 );

#endif
#ifdef MMX
	register char REG__vX, REG__vY;
	signed register char *REG__vbytes;
	int x, y;
	
	unsigned register char *REG__pixels = sdl_screen->pixels;
	const register int REG__xres_2 = 2 * XRes;
	const register int REG__sp_1 = surfacepitch;
	const register int REG__xres = XRes;
	const register int REG__yres = YRes;
	unsigned register char *REG__bd = blur_dest;

	register u_int32_t *REG__dest_pixels;
	register u_int32_t *REG__source_pixels;
	
	__m64 REG__newpixel, REG__pix1, REG__pix2, REG__pix3, REG__pix4, REG__pix5;

	bzero ( REG__bd, REG__yres * REG__sp_1 );

	REG__vbytes = vectors + vectortype * vector_fieldsize;

	for ( x = 2; x < REG__xres - 2; x++ ){
		for ( y = 2; y < REG__yres - 2; y++ ){

			REG__vX = *( REG__vbytes + y * REG__xres * 2 + x * 2 + 0 );
			REG__vY = *( REG__vbytes + y * REG__xres * 2 + x * 2 + 1 );
			
			REG__dest_pixels = (u_int32_t*)REG__bd + y * REG__xres + x;

#ifdef SSE
			asm ( "prefetchw %0" : : "m" (*REG__dest_pixels) );
#endif
			REG__source_pixels = (u_int32_t*)REG__pixels + ( y + REG__vY ) * REG__xres + ( x + REG__vX );
#ifdef SSE
			asm ( "prefetch %0" : : "m" (*REG__source_pixels) );
#endif

			REG__pix1 = _m_from_int ( *( REG__source_pixels ) );
			REG__pix1 = _mm_unpacklo_pi8 ( REG__pix1, _mm_setzero_si64 ( ) );
			REG__pix2 = _m_from_int ( *( REG__source_pixels + 2 ) );
			REG__pix2 = _mm_unpacklo_pi8 ( REG__pix2, _mm_setzero_si64 ( ) );
			REG__pix3 = _m_from_int ( *( REG__source_pixels - 2 ) );
			REG__pix3 = _mm_unpacklo_pi8 ( REG__pix3, _mm_setzero_si64 ( ) );
			REG__pix4 = _m_from_int ( *( REG__source_pixels - REG__xres_2 ) );
			REG__pix4 = _mm_unpacklo_pi8 ( REG__pix4, _mm_setzero_si64 ( ) );
			REG__pix5 = _m_from_int ( *( REG__source_pixels + REG__xres_2 ) );
			REG__pix5 = _mm_unpacklo_pi8 ( REG__pix5, _mm_setzero_si64 ( ) );

			REG__newpixel = _mm_adds_pu16 ( REG__pix1, REG__pix2 );
			REG__newpixel = _mm_adds_pu16 ( REG__newpixel, REG__pix3 );
			REG__newpixel = _mm_adds_pu16 ( REG__newpixel, REG__pix4 );
			REG__newpixel = _mm_adds_pu16 ( REG__newpixel, REG__pix5 );

			REG__newpixel = _mm_mullo_pi16 ( REG__newpixel, (__m64) 0x0032003200320032 );

			REG__newpixel = _mm_srl_pi16 ( REG__newpixel, _m_from_int ( 8 ) );

			REG__newpixel = _mm_or_si64 ( REG__newpixel, (__m64) 0xffff000000000000 );

			*REG__dest_pixels = _m_to_int ( _mm_packs_pi16 ( REG__newpixel, _mm_setzero_si64 ( ) ) );
		}
	}
	_mm_empty ();
	memcpy ( REG__pixels, REG__bd, REG__yres * REG__sp_1 );
#endif
}
