/*
 * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and
 * proprietary rights in and to this software and related documentation.
 * Any use, reproduction, disclosure, or distribution of this software
 * and related documentation without an express license agreement from
 * NVIDIA Corporation is strictly prohibited.
 *
 * Please refer to the applicable NVIDIA end user license agreement (EULA)
 * associated with this source code for terms and conditions that govern
 * your use of this NVIDIA software.
 *
 */


/*
    Parallel reduction

    This sample shows how to perform a reduction operation on an array of values
    to produce a single value.

    Reductions are a very common computation in parallel algorithms.  Any time
    an array of values needs to be reduced to a single value using a binary
    associative operator, a reduction can be used.  Example applications include
    statistics computations such as mean and standard deviation, and image
    processing applications such as finding the total luminance of an
    image.

    This code performs sum reductions, but any associative operator such as
    min() or max() could also be used.

    It assumes the input size is a power of 2.
*/

// Utilities and system includes
#include "cutil_inline.h"
#include <algorithm>

// includes, project
#include "reduction.h"

// Identifies the element type a reduction test operates on.
// runTest() uses it to decide how the random input data is generated.
enum ReduceType
{
    REDUCE_INT    // integer input data
};

////////////////////////////////////////////////////////////////////////////////
// Forward declaration of the test driver so that main() can call it before
// its definition.
//   T         element type of the data being reduced
//   argc/argv command-line arguments forwarded from main()
//   datatype  selects how the input data is generated
template <class T>
void runTest( int argc, char** argv, ReduceType datatype);

// NOTE(review): presumably the largest grid dimension supported by the target
// hardware; it is not referenced anywhere in the visible code - confirm before
// relying on it.
#define MAX_BLOCK_DIM_SIZE 65535

// When WIN32 is defined, map the POSIX name strcasecmp onto strcmpi.
#ifdef WIN32
#define strcasecmp strcmpi
#endif

// Returns true when x is a power of two, i.e. exactly one bit of x is set.
// Zero has no bit set and is therefore reported as NOT being a power of two.
extern "C"
bool isPow2(unsigned int x)
{
    // x & (x - 1) clears the lowest set bit, so the result is zero only when
    // x had at most one bit set; the explicit x != 0 test rules out zero.
    return (x != 0) && ((x & (x - 1)) == 0);
}


////////////////////////////////////////////////////////////////////////////////
// Program entry point.
// Runs the integer reduction test with the command-line arguments passed
// through unchanged, then calls cudaThreadExit() before returning.
////////////////////////////////////////////////////////////////////////////////
int main( int argc, char** argv)
{
    runTest<int>( argc, argv, REDUCE_INT);

    cudaThreadExit();

    return 0;
}

////////////////////////////////////////////////////////////////////////////////
//! Compute sum reduction on CPU
//! We use Kahan summation for an accurate sum of large arrays.
//! http://en.wikipedia.org/wiki/Kahan_summation_algorithm
//!
//! @param data       pointer to input data (may be NULL only if size <= 0)
//! @param size       number of input data elements
//! @return           sum of the first size elements of data, or 0 when there
//!                   is nothing to sum (size <= 0 or data is NULL)
////////////////////////////////////////////////////////////////////////////////
template<class T>
T reduceCPU(T *data, int size)
{
    // Nothing to sum: never touch data[0] of an empty or missing array.
    if (data == 0 || size <= 0)
    {
        return (T)0;
    }

    T sum = data[0];
    T c = (T)0.0;               // running compensation for lost low-order bits
    for (int i = 1; i < size; i++)
    {
        T y = data[i] - c;      // next input, corrected by the carried error
        T t = sum + y;          // new running total
        c = (t - sum) - y;      // what was actually added minus what was intended
        sum = t;
    }
    return sum;
}

// Rounds x up to the nearest power of two; a value that already is a power of
// two is returned unchanged.  The unsigned arithmetic wraps, so an input of 0
// (and any input above 2^31) yields 0.
unsigned int nextPow2( unsigned int x )
{
    unsigned int smeared = x - 1;

    // Copy the highest set bit into every lower position (shifts 1,2,4,8,16),
    // producing a value of the form 2^k - 1.
    for (unsigned int shift = 1; shift <= 16; shift *= 2)
    {
        smeared |= smeared >> shift;
    }

    return smeared + 1;
}

// Smaller of x and y.  Every use of an argument is parenthesized so that
// operands containing lower-precedence operators (e.g. MIN(a | b, c)) are
// compared as whole expressions.  Each argument may still be evaluated twice,
// so avoid arguments with side effects.
#define MIN(x,y) (((x) < (y)) ? (x) : (y))

////////////////////////////////////////////////////////////////////////////////
// Compute the number of threads and blocks to use for the given reduction kernel
//
// threads is n rounded up to a power of two when n < maxThreads, otherwise
// maxThreads; blocks is the number of thread blocks of that size needed to
// cover n elements (ceiling division).
//
// whichKernel and maxBlocks are accepted for interface compatibility but do
// not influence the result.
//
// Degenerate inputs never divide by zero: for n <= 0 the result is
// blocks = 0, threads = 1, and a non-positive maxThreads is treated as 1.
////////////////////////////////////////////////////////////////////////////////
void getNumBlocksAndThreads(int whichKernel, int n, int maxBlocks, int maxThreads, int &blocks, int &threads)
{
    (void)whichKernel;
    (void)maxBlocks;

    // No elements: report an empty grid instead of computing with threads == 0.
    if (n <= 0)
    {
        threads = 1;
        blocks = 0;
        return;
    }

    threads = (n < maxThreads) ? (int)nextPow2((unsigned int)n) : maxThreads;

    // Guards the division below against a non-positive maxThreads.
    if (threads < 1)
    {
        threads = 1;
    }

    // Ceiling division written so that n + threads cannot overflow.
    blocks = n / threads + ((n % threads != 0) ? 1 : 0);
}

////////////////////////////////////////////////////////////////////////////////
// This function performs a reduction of the input data multiple times and
// measures the average reduction time.
//
// Each timed iteration launches reduce<T> (not defined in this file) on
// d_idata to produce numBlocks partial sums in d_odata, then keeps launching
// reduce<T> on d_odata until no more than cpuFinalThreshold partial sums
// remain.  Whatever is left after the last iteration is summed on the host.
//
//   n, numThreads, numBlocks   launch configuration of the first pass
//   maxThreads, maxBlocks      limits forwarded to getNumBlocksAndThreads()
//   whichKernel                kernel selector forwarded to reduce<T>
//   testIterations             number of timed iterations
//   cpuFinalReduction          when true, skip the extra GPU passes and sum
//                              the numBlocks partial sums on the host
//   cpuFinalThreshold          stop the GPU passes once this many partial sums
//                              remain; values below 1 are treated as 1
//   timer                      cutil timer started/stopped around each iteration
//   h_odata                    host scratch buffer; must hold at least
//                              numBlocks elements whenever more than one
//                              partial sum can be left for the host
//   d_idata, d_odata           device input / partial-sum buffers
//
// Returns the reduction result of the last iteration.
////////////////////////////////////////////////////////////////////////////////
template <class T>
T benchmarkReduce(int  n,
                  int  numThreads,
                  int  numBlocks,
                  int  maxThreads,
                  int  maxBlocks,
                  int  whichKernel,
                  int  testIterations,
                  bool cpuFinalReduction,
                  int  cpuFinalThreshold,
                  unsigned int timer,
                  T* h_odata,
                  T* d_idata,
                  T* d_odata)
{
    // The partial-sum count can never drop below 1, so a smaller threshold
    // would keep the multi-pass loop spinning forever.
    const int finalThreshold = (cpuFinalThreshold < 1) ? 1 : cpuFinalThreshold;

    // Partial sums left in d_odata by the most recent iteration.  Stays 1 when
    // no iteration runs, so that only d_odata[0] is read back in that case.
    int remaining = 1;

    for (int i = 0; i < testIterations; ++i)
    {
        cudaThreadSynchronize();
        cutilCheckError( cutStartTimer( timer));

        // execute the kernel
        reduce<T>(n, numThreads, numBlocks, whichKernel, d_idata, d_odata);

        // check if kernel execution generated an error
        cutilCheckMsg("Kernel execution failed");

        remaining = numBlocks;

        // sum partial block sums on GPU
        while (!cpuFinalReduction && remaining > finalThreshold)
        {
            int threads = 0, blocks = 0;
            getNumBlocksAndThreads(whichKernel, remaining, maxBlocks, maxThreads, blocks, threads);

            // A block size below 2 cannot shrink the number of partial sums;
            // leave the rest to the host instead of looping without progress.
            if (threads < 2)
            {
                break;
            }

            // NOTE(review): input and output are the same buffer here; whether
            // that is safe when blocks > 1 depends on how reduce<T> orders its
            // reads and writes - confirm against the kernel implementation.
            reduce<T>(remaining, threads, blocks, whichKernel, d_odata, d_odata);
            remaining = (remaining + threads - 1) / threads;
        }

        cudaThreadSynchronize();
        cutilCheckError( cutStopTimer(timer) );

        // Checked outside the timed region: catches failures of the follow-up
        // passes, which the check after the first launch cannot see.
        cutilCheckMsg("Kernel execution failed");
    }

    T gpu_result = 0;

    if (remaining > 1)
    {
        // More than one partial sum is left on the device: finish on the host.
        cutilSafeCallNoSync( cudaMemcpy( h_odata, d_odata, remaining * sizeof(T), cudaMemcpyDeviceToHost) );
        for (int i = 0; i < remaining; ++i)
        {
            gpu_result += h_odata[i];
        }
    }
    else
    {
        // The device already holds the final sum in d_odata[0].
        cutilSafeCallNoSync( cudaMemcpy( &gpu_result, d_odata, sizeof(T), cudaMemcpyDeviceToHost) );
    }

    return gpu_result;
}



////////////////////////////////////////////////////////////////////////////////
// The main function which runs the reduction test.
//
// Fills a host array with small random values, copies it to the device, runs
// one untimed warm-up reduction followed by the timed benchmark, prints the
// measured throughput, and compares the GPU sum against reduceCPU().
//
//   argc/argv  command-line arguments (currently unused)
//   datatype   REDUCE_INT generates integer values in [0, 255]; any other
//              value generates those values scaled by 1 / RAND_MAX
//
// Prints "PASSED" when the GPU and CPU sums are equal, "FAILED" otherwise.
// Returns early with a message on stderr if a host allocation fails.
////////////////////////////////////////////////////////////////////////////////
template <class T>
void
runTest( int argc, char** argv, ReduceType datatype)
{
    (void)argc;
    (void)argv;

    int size = 1<<20;               // number of elements to reduce
    int maxThreads = 256;           // number of threads per block
    int whichKernel = 0;
    int maxBlocks = 64;
    bool cpuFinalReduction = false;
    int cpuFinalThreshold = 1;

    // create random input data on CPU
    unsigned int bytes = size * sizeof(T);
    T *h_idata = (T *) malloc(bytes);
    if (h_idata == NULL)
    {
        fprintf(stderr, "Failed to allocate %u bytes of host input memory\n", bytes);
        return;
    }

    for (int i = 0; i < size; i++)
    {
        // Keep the numbers small so we don't get truncation error in the sum
        if (datatype == REDUCE_INT)
            h_idata[i] = (T)(rand() & 0xFF);
        else
            h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX;
    }

    int numBlocks = 0;
    int numThreads = 0;
    getNumBlocksAndThreads(whichKernel, size, maxBlocks, maxThreads, numBlocks, numThreads);

    // allocate mem for the result on host side
    T* h_odata = (T*) malloc(numBlocks*sizeof(T));
    if (h_odata == NULL)
    {
        fprintf(stderr, "Failed to allocate host memory for %d partial sums\n", numBlocks);
        free(h_idata);
        return;
    }

    // allocate device memory and data
    T* d_idata = NULL;
    T* d_odata = NULL;
    cutilSafeCallNoSync( cudaMalloc((void**) &d_idata, bytes) );
    cutilSafeCallNoSync( cudaMalloc((void**) &d_odata, numBlocks*sizeof(T)) );

    // copy data directly to device memory
    cutilSafeCallNoSync( cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice) );
    // The output buffer is seeded with the first numBlocks input values; every
    // benchmark iteration overwrites it with fresh partial sums.
    cutilSafeCallNoSync( cudaMemcpy(d_odata, h_idata, numBlocks*sizeof(T), cudaMemcpyHostToDevice) );

    // warm-up
    reduce<T>(size, numThreads, numBlocks, whichKernel, d_idata, d_odata);
    cutilCheckMsg("Kernel execution failed");

    int testIterations = 100;

    unsigned int timer = 0;
    cutilCheckError( cutCreateTimer( &timer));

    T gpu_result = 0;

    gpu_result = benchmarkReduce<T>(size, numThreads, numBlocks, maxThreads, maxBlocks,
                                    whichKernel, testIterations, cpuFinalReduction,
                                    cpuFinalThreshold, timer,
                                    h_odata, d_idata, d_odata);

    // The timer value is scaled by 1e-3 to obtain seconds; throughput is based
    // on the actual input size in bytes, whatever the element type is.
    double reduceTime = cutGetAverageTimerValue(timer) * 1e-3;
    printf("Reduction, Throughput = %.4f GB/s, Time = %.5f s, Size = %d\n",
           1.0e-9 * (double)bytes / reduceTime, reduceTime, size);

    // compute reference solution
    T cpu_result = reduceCPU<T>(h_idata, size);

    // %d matches the only instantiation in this file (T = int).
    printf("\nGPU result = %d\n", gpu_result);
    printf("CPU result = %d\n\n", cpu_result);
    printf("%s\n\n", (gpu_result == cpu_result) ? "PASSED" : "FAILED");

    // cleanup
    cutilCheckError( cutDeleteTimer(timer) );
    free(h_idata);
    free(h_odata);

    cutilSafeCallNoSync(cudaFree(d_idata));
    cutilSafeCallNoSync(cudaFree(d_odata));
}
