#ifndef INTEGRATION_KERNEL_H
#define INTEGRATION_KERNEL_H

#include <stdio.h>
#include "functions.h"

/**
 * Kernel that integrates a one-argument function with the rectangle rule.
 *
 * Launch layout: only the x components of threadIdx, blockIdx and blockDim
 * are read, so a one-dimensional grid of one-dimensional blocks is expected.
 * Every thread walks `iterations` consecutive steps of width `delta`; the
 * per-thread sub-ranges follow each other in order of the global thread
 * index, the first one beginning at `start`.
 *
 * Each thread stores exactly one partial sum at index
 * blockIdx.x * blockDim.x + threadIdx.x, so `result` needs room for
 * gridDim.x * blockDim.x elements. The partial sums are not reduced here.
 *
 * @tparam T  numeric type of positions and sums
 * @tparam F  default-constructible functor type; F()(x) is the integrand at x
 * @param start       position of the first sample of thread 0 in block 0
 * @param delta       width of one integration step
 * @param iterations  number of steps processed by each thread
 * @param result      output array, one element per launched thread
 */
template <typename T, typename F>
__global__ void integrate2d_kernel(T start, T delta,
                                   unsigned int iterations, T * result)
{
    const unsigned int lane  = threadIdx.x;
    const unsigned int block = blockIdx.x;

    // distance from `start` to this block's range and, inside the block,
    // to this thread's range
    const T block_shift = delta * blockDim.x * iterations * block;
    const T lane_shift  = delta * lane * iterations;

    T x = start + block_shift + lane_shift;
    T partial = 0.0;

    // sum f(x) * delta over this thread's steps, sampling at the start of
    // each step
    unsigned int remaining = iterations;
    while (remaining != 0)
    {
        partial += (F()(x) * delta);
        // NOTE: the position advances by repeated addition, so its rounding
        // error grows with the number of iterations
        x += delta;
        --remaining;
    }

    // one partial sum per thread
    result[block * blockDim.x + lane] = partial;
}

/**
 * Host-side launcher for integrate2d_kernel.
 *
 * Launches a one-dimensional grid of `blocks` blocks with `threads` threads
 * each on the default stream. The function does not synchronize, so it may
 * return before the kernel has finished.
 *
 * The launch status is checked with cudaPeekAtLastError() and a pending
 * error is reported on stderr. The error is only peeked, not cleared, so a
 * caller that does its own cudaGetLastError() check still sees it.
 *
 * @tparam T  numeric type of positions and sums
 * @tparam F  functor type evaluated by the kernel
 * @param blocks      number of blocks in the grid
 * @param threads     number of threads per block
 * @param start       position of the first sample of the first thread
 * @param delta       width of one integration step
 * @param iterations  number of steps processed by each thread
 * @param result      output array handed to the kernel; the kernel writes
 *                    one element per launched thread (blocks * threads)
 */
template <typename T, typename F>
void integrate2d(unsigned int blocks, unsigned int threads,
                 T start, T delta, unsigned int iterations, T * result)
{
  const dim3 dimBlock(threads, 1, 1);
  const dim3 dimGrid(blocks, 1, 1);

  integrate2d_kernel<T, F><<< dimGrid, dimBlock >>>
    (start, delta, iterations, result);

  // a kernel launch returns no status itself; an invalid configuration
  // (e.g. zero blocks or too many threads) only shows up here
  const cudaError_t status = cudaPeekAtLastError();
  if (status != cudaSuccess)
  {
    fprintf(stderr, "integrate2d: CUDA error pending after kernel launch: %s\n",
            cudaGetErrorString(status));
  }
}

/**
 * Benchmark driver for integrate2d_kernel.
 *
 * Launches the kernel once untimed as warmup, then launches it `runs` times
 * between two CUDA events recorded on the default stream and prints the
 * elapsed time for all timed launches to stdout.
 *
 * Every CUDA call is checked. On failure a message is written to stderr and
 * no timing line is printed. Launch errors are peeked, not cleared. Both
 * events are destroyed before returning on every path.
 *
 * @tparam T  numeric type of positions and sums
 * @tparam F  functor type evaluated by the kernel
 * @param blocks      number of blocks in the grid
 * @param threads     number of threads per block
 * @param start       position of the first sample of the first thread
 * @param delta       width of one integration step
 * @param iterations  number of steps processed by each thread
 * @param result      output array handed to the kernel; the kernel writes
 *                    one element per launched thread (blocks * threads)
 * @param runs        number of timed launches; values <= 0 time an empty loop
 */
template <typename T, typename F>
void bench_integrate2d(unsigned int blocks, unsigned int threads,
                       T start, T delta, unsigned int iterations, T * result,
                       int runs)
{
  const dim3 dimBlock(threads, 1, 1);
  const dim3 dimGrid(blocks, 1, 1);

  // launch the kernel once for warmup so one-time startup cost is not timed
  integrate2d_kernel<T, F><<< dimGrid, dimBlock >>>
    (start, delta, iterations, result);

  cudaError_t status = cudaPeekAtLastError();
  if (status != cudaSuccess)
  {
    // timing a launch that cannot start would be meaningless
    fprintf(stderr,
            "bench_integrate2d: CUDA error pending after warmup launch: %s\n",
            cudaGetErrorString(status));
    return;
  }

  cudaEvent_t e_start;
  cudaEvent_t e_stop;

  status = cudaEventCreate(&e_start);
  if (status != cudaSuccess)
  {
    fprintf(stderr, "bench_integrate2d: cudaEventCreate failed: %s\n",
            cudaGetErrorString(status));
    return;
  }

  status = cudaEventCreate(&e_stop);
  if (status != cudaSuccess)
  {
    fprintf(stderr, "bench_integrate2d: cudaEventCreate failed: %s\n",
            cudaGetErrorString(status));
    cudaEventDestroy(e_start);
    return;
  }

  float elapsedTime = 0.0f;

  // each step runs only if everything before it succeeded
  status = cudaEventRecord(e_start, 0);
  if (status == cudaSuccess)
  {
    for (int i = 0; i < runs; i++)
    {
      integrate2d_kernel<T, F><<< dimGrid, dimBlock >>>
        (start, delta, iterations, result);
    }
    status = cudaPeekAtLastError();
  }
  if (status == cudaSuccess)
  {
    status = cudaEventRecord(e_stop, 0);
  }
  if (status == cudaSuccess)
  {
    // waits for the timed launches; also surfaces errors raised while the
    // kernels were executing
    status = cudaEventSynchronize(e_stop);
  }
  if (status == cudaSuccess)
  {
    status = cudaEventElapsedTime(&elapsedTime, e_start, e_stop);
  }

  if (status == cudaSuccess)
  {
    printf("calls %d time %1.2fms\n", runs, elapsedTime);
  }
  else
  {
    fprintf(stderr, "bench_integrate2d: CUDA error during benchmark: %s\n",
            cudaGetErrorString(status));
  }

  // release the events on every path
  cudaEventDestroy(e_start);
  cudaEventDestroy(e_stop);
}

// Explicit instantiations of the launcher and the benchmark driver for
// float, one pair per functor template named below (the functor templates
// are declared outside this file, presumably in functions.h).
#define INTEGRATION_KERNEL_INSTANTIATE_FLOAT(FUNCTOR)                     \
  template void integrate2d<float, FUNCTOR<float>>(                       \
    unsigned int blocks, unsigned int threads,                            \
    float start, float delta, unsigned int iterations, float * result);   \
  template void bench_integrate2d<float, FUNCTOR<float>>(                 \
    unsigned int blocks, unsigned int threads,                            \
    float start, float delta, unsigned int iterations, float * result,    \
    int runs);

INTEGRATION_KERNEL_INSTANTIATE_FLOAT(x_squared_functor)
INTEGRATION_KERNEL_INSTANTIATE_FLOAT(x_sqrtf_functor)
INTEGRATION_KERNEL_INSTANTIATE_FLOAT(x_complex_functor)

// keep the helper macro private to this file
#undef INTEGRATION_KERNEL_INSTANTIATE_FLOAT

#endif // #ifndef INTEGRATION_KERNEL_H
