#include <stdio.h>
#include <stdlib.h>
#include "cutil_inline.h"
#include "functions.h"
#include "integration.h"

/**
 * Compute a sum reduction on the CPU.
 *
 * Uses Kahan (compensated) summation so that the rounding error of each
 * addition is carried into the next one instead of being lost, which keeps
 * the sum of large arrays accurate.
 * http://en.wikipedia.org/wiki/Kahan_summation_algorithm
 *
 * @param data  pointer to the first of `size` values to add up
 * @param size  number of values in `data`
 * @return the compensated sum of data[0] .. data[size-1]; zero when `data`
 *         is null or `size` is not positive
 */
template<class T>
T reduceCPU(T *data, int size)
{
    // Nothing to add up: do not touch data[0] of an empty or missing buffer.
    if (!data || size <= 0)
    {
        return (T)0.0;
    }

    T sum = data[0];
    T c = (T)0.0;                        // running compensation for lost low-order bits
    for (int i = 1; i < size; i++)
    {
        T y = data[i] - c;               // next value to add minus compensation
        T t = sum + y;                   // summation we want to calculate
        c = (t - sum) - y;               // what was actually added minus what should have been
        sum = t;
    }
    return sum;
}

/**
 * This function calculates the 2D integration on the CPU
 */
template <typename T, typename F>
T integrate2d_cpu(T start, T stop, unsigned int subdivisions,
										  unsigned int blocks, unsigned int threads)
{
	T delta = (stop - start)/subdivisions;
	T result = 0.0;
	T c = T(0.0);
	for(unsigned int i=0; i<subdivisions; i++)
	{
		T add = (F()(start) * delta);
    T y = add - c;
    T t = result + y;
    c = (t - result) - y;
    result = t;
		start += delta;
	}
	return result;
}

/**
 * Approximates the definite integral of F over [start, stop] on the GPU.
 *
 * The interval is split into `subdivisions` steps of width
 * (stop - start) / subdivisions. integrate2d<T, F> (declared outside this
 * file) is invoked with the launch configuration, the start value, the step
 * width, a per-thread iteration count and a device buffer holding one T per
 * thread (blocks * threads elements). That buffer is copied back to the host
 * and summed with reduceCPU.
 *
 * NOTE(review): the per-thread iteration count is
 * subdivisions / blocks / threads with integer truncation, so when
 * subdivisions is not a multiple of blocks * threads the remaining steps are
 * presumably not integrated -- confirm against integrate2d.
 *
 * Prints a message to stderr and terminates the process when blocks or
 * threads is zero or when the host buffer cannot be allocated.
 *
 * @param start         lower bound of the integration interval
 * @param stop          upper bound of the integration interval
 * @param subdivisions  total number of steps the interval is split into
 * @param blocks        number of blocks passed to integrate2d
 * @param threads       number of threads per block passed to integrate2d
 * @return the sum of all per-thread partial results
 */
template <typename T, typename F>
T integrate2d_gpu(T start, T stop, unsigned int subdivisions,
									unsigned int blocks, unsigned int threads)
{
	// A zero launch dimension would divide by zero below and is not a valid
	// launch configuration anyway.
	if (blocks == 0 || threads == 0)
	{
		fprintf(stderr, "integrate2d_gpu: blocks and threads must be non-zero\n");
		exit(EXIT_FAILURE);
	}

	T delta = (stop - start)/subdivisions;
	unsigned int iterations = subdivisions / blocks / threads;

	// One partial result per thread; widen before multiplying so the byte
	// count is not computed in 32-bit arithmetic.
	const size_t count = (size_t)blocks * threads;
	const size_t bytes = count * sizeof(T);

	// allocate memory for output data on cpu
	T* outdata = (T*) malloc(bytes);
	if (outdata == NULL)
	{
		fprintf(stderr, "integrate2d_gpu: host allocation of %lu bytes failed\n",
			(unsigned long)bytes);
		exit(EXIT_FAILURE);
	}

	// allocate memory for output on gpu
	T* d_outdata = NULL;
	cutilSafeCallNoSync( cudaMalloc((void**) &d_outdata, bytes) );

	integrate2d<T, F>(blocks, threads, start, delta, iterations, d_outdata);

	cutilCheckMsg("Kernel execution failed");
	// Check the synchronisation result as well, so an error returned here is
	// reported at this point rather than silently dropped.
	cutilSafeCallNoSync( cudaThreadSynchronize() );

	// copy data from gpu
	cutilSafeCallNoSync( cudaMemcpy( outdata, d_outdata,
		bytes, cudaMemcpyDeviceToHost) );
	cutilSafeCallNoSync(cudaFree(d_outdata));

	// reduction of the per-thread partial sums on the host
	T result = reduceCPU<T>(outdata, (int)count);

	free(outdata);
	return result;
}

/**
 * Program entry point: integrates x_squared_functor over [-100, 100] once on
 * the GPU (float) and twice on the CPU (float and double), then prints the
 * three results for comparison.
 */
int main (void)
{
	// Problem setup shared by all three runs.
	const double lower_bound = -100.0;
	const double upper_bound = 100.0;
	const unsigned int num_blocks = 128;
	const unsigned int num_threads = 32;
	const unsigned int num_subdivisions = 100*128*32;

	printf("Integration Project\n");

	// Single precision on the GPU.
	const float result_gpu =
		integrate2d_gpu<float, x_squared_functor<float> >(
			lower_bound, upper_bound, num_subdivisions, num_blocks, num_threads);

	// Single precision on the CPU.
	const float result_cpu =
		integrate2d_cpu<float, x_squared_functor<float> >(
			lower_bound, upper_bound, num_subdivisions, num_blocks, num_threads);

	// Double precision on the CPU.
	const double result_cpud =
		integrate2d_cpu<double, x_squared_functor<double> >(
			lower_bound, upper_bound, num_subdivisions, num_blocks, num_threads);

	printf("result_gpu\t %f\nresult_cpu\t %f\nresult_cpud\t %f\n",
		result_gpu, result_cpu, result_cpud);

	cudaThreadExit();
	return 0;
}
