#include <stdio.h>
#include "cutil_inline.h"
#include "functions.h"
#include "integration.h"

/**
 * Compute the sum of an array on the CPU.
 *
 * Uses Kahan (compensated) summation so the accumulated rounding error stays
 * small even for large arrays:
 * http://en.wikipedia.org/wiki/Kahan_summation_algorithm
 *
 * @param data  values to add up; never dereferenced when it is NULL or
 *              when size <= 0
 * @param size  number of elements in data
 * @return      compensated sum of data[0 .. size-1], or T(0) for an empty
 *              range
 */
template<class T>
T reduceCPU(T *data, int size)
{
    // An empty range has no first element to seed the sum with.
    if (!data || size <= 0)
    {
        return (T)0.0;
    }

    T sum = data[0];
    T c = (T)0.0;          // compensation: low-order bits lost so far
    for (int i = 1; i < size; i++)
    {
        T y = data[i] - c; // next value to add minus compensation
        T t = sum + y;     // summation we want to calculate
        c = (t - sum) - y; // calculate compensation for next run
        sum = t;
    }
    return sum;
}

/**
 * CPU stand-in for the GPU integration (per the original author it simulates
 * the GPU calculation).
 *
 * The range [start, stop) is cut into `subdivisions` slices of width
 * delta = (stop - start) / subdivisions.  For every (block, thread) pair a
 * partial sum of `subdivisions / blocks / threads` consecutive slices
 * F()(pos) * delta is accumulated, sampling at the left edge of each slice.
 * The blocks * threads partial sums are then added up with reduceCPU.
 *
 * Note: the per-thread slice count uses integer division, so trailing slices
 * are not evaluated when subdivisions is not a multiple of blocks * threads.
 *
 * @param start         lower bound of the integration range
 * @param stop          upper bound of the integration range
 * @param subdivisions  total number of slices
 * @param blocks        number of simulated blocks
 * @param threads       number of simulated threads per block
 * @return              sum of all partial results; T(0) when blocks or
 *                      threads is 0
 */
template <typename T, typename F>
T integrate2d_cpu_sim(T start, T stop, unsigned int subdivisions, 
										  unsigned int blocks, unsigned int threads)
{
	// Without any simulated thread there is nothing to sum, and the slice
	// count below would divide by zero.
	if (blocks == 0 || threads == 0)
	{
		return T(0.0);
	}

	T delta = (stop - start)/subdivisions;
	unsigned int iterations = subdivisions / blocks / threads;

	// allocate memory for buffering: one partial sum per simulated thread
	const unsigned int count = blocks*threads;
	T* outdata = (T*) malloc((size_t)count*sizeof(T));
	if (outdata == NULL)
	{
		fprintf(stderr, "integrate2d_cpu_sim: out of host memory\n");
		exit(EXIT_FAILURE);
	}

	T pos = start;
	for(unsigned int b=0; b<blocks; b++)              // simulate GPU calculation
	{
		for(unsigned int t=0; t<threads; t++)
		{
			T r = 0.0;
			for(unsigned int i=0; i<iterations; i++)
			{
				r += (F()(pos) * delta);
				pos += delta;                           // position carries over to the next thread
			}
			outdata[b*threads + t] = r;
		}
	}

	T result = reduceCPU<T>(outdata, count);          // reduction

	free(outdata);
	return result;
}

/**
 * Integrate F over [start, stop) on the CPU in a single sequential pass.
 *
 * The range is cut into `subdivisions` slices of equal width; each slice
 * contributes F()(x) * width with x taken at the slice's left edge.  The
 * contributions are accumulated with Kahan (compensated) summation.
 *
 * The original author notes that this gives different results than the
 * other variants (presumably because the summation order differs).
 *
 * @param start         lower bound of the integration range
 * @param stop          upper bound of the integration range
 * @param subdivisions  number of slices
 * @param blocks        not used by this variant
 * @param threads       not used by this variant
 * @return              the compensated sum of all slice contributions
 */
template <typename T, typename F>
T integrate2d_cpu(T start, T stop, unsigned int subdivisions, 
										  unsigned int blocks, unsigned int threads)
{
	(void)blocks;
	(void)threads;

	const T width = (stop - start)/subdivisions;
	T total = 0.0;
	T lost = T(0.0);                       // Kahan compensation term
	T x = start;                           // left edge of the current slice

	unsigned int done = 0;
	while (done < subdivisions)
	{
		const T slice = (F()(x) * width);
		const T corrected = slice - lost;    // contribution minus carried error
		const T next = total + corrected;    // low-order bits may be lost here
		lost = (next - total) - corrected;   // recover what was lost
		total = next;

		x += width;
		++done;
	}
	return total;
}

/**
 * Integrate F over [start, stop) using the GPU for the partial sums and the
 * CPU for the final reduction.
 *
 * A device buffer of blocks * threads elements is allocated and handed to
 * integrate2d<T, F> (declared outside this section) together with the launch
 * dimensions, the start position, the slice width and the per-thread slice
 * count subdivisions / blocks / threads.  The buffer is then copied back to
 * the host and summed with reduceCPU.
 * NOTE(review): presumably integrate2d writes one partial sum per thread into
 * the buffer - confirm against its definition.
 *
 * @param start         lower bound of the integration range
 * @param stop          upper bound of the integration range
 * @param subdivisions  total number of slices
 * @param blocks        first launch dimension passed to integrate2d
 * @param threads       second launch dimension passed to integrate2d
 * @return              sum of the copied-back buffer; T(0) when blocks or
 *                      threads is 0
 */
template <typename T, typename F>
T integrate2d_gpu(T start, T stop, unsigned int subdivisions, 
									unsigned int blocks, unsigned int threads)
{
	// An empty launch produces no partial sums; bail out before the slice
	// count below divides by zero.
	if (blocks == 0 || threads == 0)
	{
		return T(0.0);
	}

	T delta = (stop - start)/subdivisions;
	unsigned int iterations = subdivisions / blocks / threads;

	const unsigned int count = blocks*threads;
	const size_t bytes = (size_t)count*sizeof(T);

	// allocate memory for output data on cpu
	T* outdata = (T*) malloc(bytes);
	if (outdata == NULL)
	{
		fprintf(stderr, "integrate2d_gpu: out of host memory\n");
		exit(EXIT_FAILURE);
	}

	// allocate memory for output on gpu
	T* d_outdata = NULL;
	cutilSafeCallNoSync( cudaMalloc((void**) &d_outdata, bytes) );

	integrate2d<T, F>(blocks, threads, start, delta, iterations, d_outdata);

	cutilCheckMsg("Kernel execution failed");
	// check the synchronization result instead of dropping it
	cutilSafeCallNoSync( cudaThreadSynchronize() );

	// copy data from gpu
	cutilSafeCallNoSync( cudaMemcpy( outdata, d_outdata, 
		bytes, cudaMemcpyDeviceToHost) );
	cutilSafeCallNoSync(cudaFree(d_outdata));

	T result = reduceCPU<T>(outdata, count);          // reduction

	free(outdata);
	return result;
}

/**
 * Benchmark variant of the GPU integration.
 *
 * Prints the per-thread slice count and the slice width, then calls
 * bench_integrate2d<T, F> (declared outside this section) with a device
 * buffer of blocks * threads elements and the `runs` argument.  The buffer is
 * copied back to the host and summed with reduceCPU.
 * NOTE(review): `runs` is presumably a repetition count used by
 * bench_integrate2d - confirm against its definition.
 *
 * @param start         lower bound of the integration range
 * @param stop          upper bound of the integration range
 * @param subdivisions  total number of slices
 * @param blocks        first launch dimension passed to bench_integrate2d
 * @param threads       second launch dimension passed to bench_integrate2d
 * @param runs          forwarded unchanged to bench_integrate2d
 * @return              sum of the copied-back buffer; T(0) when blocks or
 *                      threads is 0
 */
template <typename T, typename F>
T bench_integrate2d_gpu(T start, T stop, unsigned int subdivisions, 
									unsigned int blocks, unsigned int threads, int runs)
{
	// An empty launch produces no partial sums; bail out before the slice
	// count below divides by zero.
	if (blocks == 0 || threads == 0)
	{
		return T(0.0);
	}

	T delta = (stop - start)/subdivisions;
	unsigned int iterations = subdivisions / blocks / threads;
	// iterations is unsigned, and delta must reach the variadic call as double
	printf("%u %f\n", iterations, (double)delta);

	const unsigned int count = blocks*threads;
	const size_t bytes = (size_t)count*sizeof(T);

	// allocate memory for output data on cpu
	T* outdata = (T*) malloc(bytes);
	if (outdata == NULL)
	{
		fprintf(stderr, "bench_integrate2d_gpu: out of host memory\n");
		exit(EXIT_FAILURE);
	}

	// allocate memory for output on gpu
	T* d_outdata = NULL;
	cutilSafeCallNoSync( cudaMalloc((void**) &d_outdata, bytes) );

	bench_integrate2d<T, F>
		(blocks, threads, start, delta, iterations, d_outdata, runs);

	cutilCheckMsg("Kernel execution failed");
	// check the synchronization result instead of dropping it
	cutilSafeCallNoSync( cudaThreadSynchronize() );

	// copy data from gpu
	cutilSafeCallNoSync( cudaMemcpy( outdata, d_outdata, 
		bytes, cudaMemcpyDeviceToHost) );
	cutilSafeCallNoSync(cudaFree(d_outdata));

	T result = reduceCPU<T>(outdata, count);          // reduction

	free(outdata);
	return result;
}



/**
 * Driver: compares the CPU-simulated, GPU, single-pass float and single-pass
 * double integration for several functors, and runs the GPU benchmark with
 * different run counts and launch dimensions.  Waits for a key press before
 * shutting down the CUDA context.
 */
int main (void)
{
	const unsigned int subdivisions = 100*128*32;

	printf("hello world!\n");
	float integral = 0, integral2 = 0, integral3=0;
	double integral4;
	integral = 
		integrate2d_cpu_sim<float, x_squared_functor<float> >(-100.0, 100.0, subdivisions, 128, 32);
	integral2 = integrate2d_gpu<float, x_squared_functor<float> >(-100.0, 100.0, subdivisions, 128, 32);
	integral3 = integrate2d_cpu<float, x_squared_functor<float> >(-100.0, 100.0, subdivisions, 128, 32);
	integral4 = integrate2d_cpu<double, x_squared_functor<double> >(-100.0, 100.0, subdivisions, 128, 32);
	printf("cpu_sim %f gpu %f cpu_float %f cpu_double %f\n", integral, integral2, integral3, integral4);

	// Run counts 1, 10, 100, 1000, 10000.  Built with integer arithmetic:
	// truncating pow(10.0, i) to int can yield 99, 999, ... when pow returns
	// a value just below the exact power.
	int runs = 1;
	for(unsigned short int i=0; i<5; i++)
	{
		bench_integrate2d_gpu<float, x_squared_functor<float> >(-100.0, 100.0, subdivisions, 128, 32, 
			runs);
		runs *= 10;
	}

	runs = 1;
	for(unsigned short int i=0; i<5; i++)
	{
		bench_integrate2d_gpu<float, x_sqrtf_functor<float> >(0.0, 100.0, subdivisions, 128, 32, 
			runs);
		runs *= 10;
	}
	printf("___\n");
	// same workload with different launch dimensions
	float t;
	t = bench_integrate2d_gpu<float, x_sqrtf_functor<float> >(0.0, 100.0, subdivisions, 128, 32, 1000);
	printf("%f\n", t);
	t = bench_integrate2d_gpu<float, x_sqrtf_functor<float> >(0.0, 100.0, subdivisions, 64, 64, 1000);
	printf("%f\n", t);
	t = bench_integrate2d_gpu<float, x_sqrtf_functor<float> >(0.0, 100.0, subdivisions, 320, 128, 1000);
	printf("%f\n", t);
	t = bench_integrate2d_gpu<float, x_sqrtf_functor<float> >(0.0, 100.0, subdivisions, 128, 32, 1000);
	printf("%f\n", t);
	t = bench_integrate2d_gpu<float, x_sqrtf_functor<float> >(0.0, 100.0, subdivisions, 64, 64, 1000);
	printf("%f\n", t);
	t = bench_integrate2d_gpu<float, x_sqrtf_functor<float> >(0.0, 100.0, subdivisions, 100, 8, 1000);
	printf("%f\n", t);
	printf("___\n");

	integral = 
		integrate2d_cpu_sim<float, x_sqrtf_functor<float> >(0.0, 100.0, subdivisions, 128, 32);
	integral2 = integrate2d_gpu<float, x_sqrtf_functor<float> >(0.0, 100.0, subdivisions, 128, 32);
	integral3 = integrate2d_cpu<float, x_sqrtf_functor<float> >(0.0, 100.0, subdivisions, 128, 32);
	integral4 = integrate2d_cpu<double, x_sqrtf_functor<double> >(0.0, 100.0, subdivisions, 128, 32);
	printf("cpu_sim %f gpu %f cpu_float %f cpu_double %f\n", integral, integral2, integral3, integral4);

	integral = 
		integrate2d_cpu_sim<float, x_complex_functor<float> >(0.0, 3.0, subdivisions, 128, 32);
	integral2 = integrate2d_gpu<float, x_complex_functor<float> >(0.0, 3.0, subdivisions, 128, 32);
	integral3 = integrate2d_cpu<float, x_complex_functor<float> >(0.0, 3.0, subdivisions, 128, 32);
	integral4 = integrate2d_cpu<double, x_complex_functor<double> >(0.0, 3.0, subdivisions, 128, 32);
	printf("cpu_sim %f gpu %f cpu_float %f cpu_double %f\n", integral, integral2, integral3, integral4);

	getchar();              // keep the console open until a key is pressed
	cudaThreadExit();
	return 0;
}
