/*****************************************************************
* Unipro UGENE - Integrated Bioinformatics Suite
* Copyright (C) 2008,2009 Unipro, Russia (http://ugene.unipro.ru)
* All Rights Reserved
* 
*     This source code is distributed under the terms of the
*     GNU General Public License. See the files COPYING and LICENSE
*     for details.
*****************************************************************/

//#ifdef SW2_BUILD_WITH_CUDA

#include <stdlib.h>
#include <stdio.h>

#include "ResultType.h"
#include "CList.h"

typedef int ScoreType;

// Debug helper: writes the first N entries of `row` to stdout on a single
// line, each value followed by one space, then terminates the line.
// Nothing but the newline is printed when N is zero or negative.
void printRow(ScoreType* row, int N) {
	int idx = 0;
	while (idx < N) {
		printf("%i", row[idx]);
		printf(" ");
		++idx;
	}
	printf("\n");
}

/**
 * Computes one anti-diagonal of the local-alignment score matrix with
 * affine gap penalties (scores are floored at 0).
 *
 * Thread i handles query position i and library position j = iteration - i.
 * Launch layout: 1-D grid of 1-D blocks; the grid may be rounded up, threads
 * with i >= queryLength exit immediately.
 *
 * Buffer roles (as rotated by calculateOnGPU between launches):
 *   g_HdataPrev / g_directionsPrev  - values of the previous anti-diagonal
 *   g_HdataCur  / g_directionsCur   - values of the anti-diagonal before that
 *   g_HdataRec  / g_directionsRec   - output for the current anti-diagonal
 *   g_EdataCur                      - E values of the previous anti-diagonal
 *   g_EdataRec                      - E output for the current anti-diagonal
 *   g_Fdata                         - F values, updated in place per query position
 *
 * Preconditions:
 *   - every row pointer must be readable at index -1 (thread 0 reads [i - 1]);
 *     g_directionsCur must also be writable at index -1;
 *   - all pointers are device pointers.
 *
 * @param seqLib        library sequence; each symbol selects a row of queryProfile
 * @param seqLibLength  number of symbols in seqLib
 * @param queryProfile  substitution scores, indexed as [symbol * queryLength + i]
 * @param queryLength   number of query positions
 * @param gapOpen       penalty subtracted from H when a gap is opened
 * @param gapExtension  penalty subtracted from E/F when a gap is extended
 * @param maxScore      score threshold; reaching it raises *g_isActual
 * @param g_isActual    set to true when any in-matrix cell of this
 *                      anti-diagonal reaches maxScore (never cleared here)
 * @param iteration     index of the anti-diagonal being computed
 */
__global__ void calculateMatrix(const char * seqLib, int seqLibLength, ScoreType* queryProfile, 
								int queryLength, ScoreType gapOpen, ScoreType gapExtension, 
								ScoreType* g_HdataPrev, ScoreType* g_HdataCur, ScoreType* g_HdataRec,
								ScoreType* g_EdataCur, ScoreType* g_EdataRec,
								ScoreType* g_Fdata, ScoreType* g_directionsPrev, 
								ScoreType* g_directionsCur, ScoreType* g_directionsRec,
								int maxScore, bool * g_isActual, int iteration) {

	const int i = blockIdx.x * blockDim.x + threadIdx.x;

	// The grid is rounded up to whole blocks: surplus threads have no query
	// position and must neither index queryProfile past its row nor raise
	// the result flag.
	if (i >= queryLength) {
		return;
	}

	const int j = iteration - i;
	const bool insideLibrary = (j >= 0 && j < seqLibLength);

	// Cells outside the library contribute a neutral substitution score.
	ScoreType substScore = 0;
	if (insideLibrary) {
		// NOTE(review): assumes every seqLib symbol is a non-negative row
		// index into queryProfile (plain char may be signed) - confirm
		// against the code that builds the profile.
		substScore = queryProfile[seqLib[j] * queryLength + i];
	}

	ScoreType r_H = g_HdataCur[i - 1];   // diagonal predecessor
	ScoreType r_He = g_HdataPrev[i - 1]; // H of the cell at query position i - 1
	ScoreType r_Hf = g_HdataPrev[i];     // H of the cell at library position j - 1
	ScoreType r_E = g_EdataCur[i - 1];
	ScoreType r_F = g_Fdata[i];

	// Affine gaps: either extend the running gap or open a new one.
	r_E = max(r_E - gapExtension, r_He - gapOpen);
	r_F = max(r_F - gapExtension, r_Hf - gapOpen);

	r_H += substScore;
	r_H = max(r_H, 0);
	r_H = max(r_H, r_E);
	r_H = max(r_H, r_F);

	// Seed the boundary entry read below by thread 0 through
	// g_directionsCur[i - 1]: an alignment entering query position 0
	// diagonally starts at the current library position (j == iteration
	// when i == 0).
	if (i == 0) g_directionsCur[-1] = iteration;

	g_EdataRec[i] = r_E;
	g_Fdata[i] = r_F;
	g_HdataRec[i] = r_H;

	// "direction" carries the library start position of the alignment that
	// ends in this cell; it is inherited from whichever predecessor produced
	// the score, or reset to the next library position when the score is 0.
	ScoreType direction = -1;

	if (r_H == 0) {
		direction = j + 1;
	}
	else if (r_H == r_E) {
		direction = g_directionsPrev[i - 1];
	}
	else if (r_H == r_F) {
		direction = g_directionsPrev[i];
	}
	else {
		direction = g_directionsCur[i - 1];
	}

	g_directionsRec[i] = direction;

	// Only cells that really belong to the matrix may report a hit. Every
	// writer stores the same value, so the concurrent writes are harmless.
	if (r_H >= maxScore && insideLibrary) {
		*g_isActual = true;
	}
}

/**
 * Aligns a query (given as a score profile) against a library sequence on
 * the GPU, one anti-diagonal per kernel launch, and collects every matrix
 * cell whose score reaches maxScore.
 *
 * @param seqLib        host pointer to the library sequence
 * @param seqLibLength  number of symbols in seqLib
 * @param queryProfile  host pointer to the substitution-score profile
 * @param qProfLen      number of ScoreType entries in queryProfile
 * @param queryLength   number of query positions
 * @param gapOpen       gap-open penalty forwarded to the kernel
 * @param gapExtension  gap-extension penalty forwarded to the kernel
 * @param maxScore      minimal score a cell must reach to be reported
 * @return a list allocated with new (it is not released here, so the caller
 *         is responsible for it). Each entry holds the score, the start
 *         position and the length of a hit. The list is empty when the
 *         input is empty; when a CUDA call fails the error is written to
 *         stderr and the hits collected so far are returned.
 */
CList<resType>* calculateOnGPU(const char * seqLib, int seqLibLength, ScoreType* queryProfile, ScoreType qProfLen, int queryLength, ScoreType gapOpen, ScoreType gapExtension, ScoreType maxScore) {	

	CList<resType>* pas = new CList<resType>;

	// Nothing to align. This also keeps the launch configuration valid:
	// queryLength == 0 would give a block size of 0 and a division by zero
	// when the grid size is computed.
	if (seqLib == NULL || queryProfile == NULL || seqLibLength <= 0 || queryLength <= 0 || qProfLen <= 0) {
		return pas;
	}

//************************** sizes of arrays

	// Each device row holds queryLength plus a left and a right margin of
	// queryLength entries; the kernel reads index -1 of the shifted rows and
	// the rounded-up grid may touch entries past queryLength.
	const size_t rowLen = 3 * (size_t)queryLength;
	const size_t sizeQ = rowLen * sizeof(ScoreType);
	const size_t sizeHostRow = (size_t)queryLength * sizeof(ScoreType);
	const size_t sizeP = (size_t)qProfLen * sizeof(ScoreType);
	const size_t sizeL = (size_t)seqLibLength * sizeof(char);
	const size_t sizeB = sizeof(bool);

//************************** host staging buffers

	// One allocation, split in two halves: scores and start positions of the
	// queryLength cells of an anti-diagonal.
	ScoreType* hostRows = new ScoreType[2 * (size_t)queryLength];
	ScoreType* tempRow = hostRows;
	ScoreType* directionRow = hostRows + queryLength;
	bool isActual = false;

//************************** allocate and initialise global memory on device

	enum { H_PREV, H_CUR, H_REC, E_CUR, E_REC, F_ROW, DIR_PREV, DIR_CUR, DIR_REC, ROW_COUNT };

	// Everything starts as NULL so the cleanup below is safe no matter how
	// far the allocation got.
	char* g_seqLib = NULL;
	ScoreType* g_queryProfile = NULL;
	bool* g_isActual = NULL;
	ScoreType* rowBase[ROW_COUNT];
	for (int r = 0; r < ROW_COUNT; r++) {
		rowBase[r] = NULL;
	}

	cudaError_t status = cudaMalloc((void **)&g_seqLib, sizeL);
	if (status == cudaSuccess) status = cudaMalloc((void **)&g_queryProfile, sizeP);
	if (status == cudaSuccess) status = cudaMalloc((void **)&g_isActual, sizeB);
	for (int r = 0; r < ROW_COUNT && status == cudaSuccess; r++) {
		status = cudaMalloc((void **)&rowBase[r], sizeQ);
		// All rows start zeroed; no host-side zero array is needed.
		if (status == cudaSuccess) status = cudaMemset(rowBase[r], 0, sizeQ);
	}

	if (status == cudaSuccess) status = cudaMemcpy(g_seqLib, seqLib, sizeL, cudaMemcpyHostToDevice);
	if (status == cudaSuccess) status = cudaMemcpy(g_queryProfile, queryProfile, sizeP, cudaMemcpyHostToDevice);

//************************** start calculation

	if (status == cudaSuccess) {

		// Working pointers are shifted past the left margin; rowBase keeps
		// the addresses that have to be passed to cudaFree.
		ScoreType* g_HdataPrev = rowBase[H_PREV] + queryLength;
		ScoreType* g_HdataCur = rowBase[H_CUR] + queryLength;
		ScoreType* g_HdataRec = rowBase[H_REC] + queryLength;
		ScoreType* g_EdataCur = rowBase[E_CUR] + queryLength;
		ScoreType* g_EdataRec = rowBase[E_REC] + queryLength;
		ScoreType* g_Fdata = rowBase[F_ROW] + queryLength;
		ScoreType* g_directionsPrev = rowBase[DIR_PREV] + queryLength;
		ScoreType* g_directionsCur = rowBase[DIR_CUR] + queryLength;
		ScoreType* g_directionsRec = rowBase[DIR_REC] + queryLength;

		int BLOCK_SIZE = queryLength;
		if (queryLength > 10 && queryLength < 200) {
			BLOCK_SIZE = queryLength / 2;
		}
		else if (queryLength >= 200) BLOCK_SIZE = 100;

		dim3 dimBlock(BLOCK_SIZE);
		dim3 dimGrid((queryLength + dimBlock.x - 1) / dimBlock.x);

		// One launch per anti-diagonal of the seqLibLength x queryLength matrix.
		const int diagonalCount = seqLibLength + queryLength - 1;
		for (int i = 0; i < diagonalCount; i++) {

			// The flag tells whether this anti-diagonal holds a reportable score.
			status = cudaMemset(g_isActual, 0, sizeB);
			if (status != cudaSuccess) break;

			calculateMatrix<<<dimGrid, dimBlock>>>(g_seqLib, seqLibLength, 
				g_queryProfile, queryLength, gapOpen, 
				gapExtension, g_HdataPrev, g_HdataCur, 
				g_HdataRec,	g_EdataCur, g_EdataRec, g_Fdata, 
				g_directionsPrev, g_directionsCur,
				g_directionsRec, maxScore, g_isActual, i);

			// Launch-configuration errors show up here, execution errors at
			// the synchronisation.
			// NOTE(review): cudaThreadSynchronize() is deprecated in newer
			// CUDA toolkits in favour of cudaDeviceSynchronize(); it is kept
			// for compatibility with the toolkit this file targets.
			status = cudaGetLastError();
			if (status == cudaSuccess) status = cudaThreadSynchronize();
			if (status == cudaSuccess) status = cudaMemcpy(&isActual, g_isActual, sizeB, cudaMemcpyDeviceToHost);
			if (status != cudaSuccess) break;

			if (isActual) {

				// Copy the anti-diagonal to the host and pick the actual results.
				// (For debugging, a copied row can be dumped with
				// printRow(tempRow, queryLength).)
				status = cudaMemcpy(tempRow, g_HdataRec, sizeHostRow, cudaMemcpyDeviceToHost);
				if (status == cudaSuccess) status = cudaMemcpy(directionRow, g_directionsRec, sizeHostRow, cudaMemcpyDeviceToHost);
				if (status != cudaSuccess) break;

				for (int j = 0; j < queryLength; j++) {
					resType res;
					// Library position of query position j on anti-diagonal i;
					// cells before or after the library are not part of the matrix.
					const int libPos = i - j;
					if (tempRow[j] >= maxScore && libPos >= 0 && libPos < seqLibLength) {
						res.reg.startPos = directionRow[j];
						res.reg.len = libPos - res.reg.startPos + 1;
						res.score = tempRow[j];
						pas->AddEnd(res);
					}
				}
			}

			// Rotate the rows: the freshly written row becomes "previous",
			// the old "previous" becomes "current" and the old "current" is
			// recycled as the next output row.
			ScoreType* recycled = g_HdataCur;
			g_HdataCur = g_HdataPrev;
			g_HdataPrev = g_HdataRec;
			g_HdataRec = recycled;

			recycled = g_directionsCur;
			g_directionsCur = g_directionsPrev;
			g_directionsPrev = g_directionsRec;
			g_directionsRec = recycled;

			// E only needs the previous anti-diagonal: swap input and output.
			recycled = g_EdataCur;
			g_EdataCur = g_EdataRec;
			g_EdataRec = recycled;
		}
	}

//************************** end calculation

	if (status != cudaSuccess) {
		fprintf(stderr, "calculateOnGPU: CUDA error: %s\n", cudaGetErrorString(status));
	}

//************************** free memory

	// cudaFree accepts null pointers, so partially completed allocation is fine.
	for (int r = 0; r < ROW_COUNT; r++) {
		cudaFree(rowBase[r]);
	}
	cudaFree(g_isActual);
	cudaFree(g_queryProfile);
	cudaFree(g_seqLib);

	delete[] hostRows;

	return pas;
}

//#endif //SW2_BUILD_WITH_CUDA