/*
 * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and 
 * proprietary rights in and to this software and related documentation. 
 * Any use, reproduction, disclosure, or distribution of this software 
 * and related documentation without an express license agreement from
 * NVIDIA Corporation is strictly prohibited.
 *
 * Please refer to the applicable NVIDIA end user license agreement (EULA) 
 * associated with this source code for terms and conditions that govern 
 * your use of this NVIDIA software.
 * 
 */



////////////////////////////////////////////////////////////////////////////////
// Common definition
////////////////////////////////////////////////////////////////////////////////
#define HISTOGRAM256_BIN_COUNT 256

#define      UINT_BITS 32U
#define LOG2_WARP_SIZE 5U
#define      WARP_SIZE (1U << LOG2_WARP_SIZE)

//Warps ==subhistograms per work-group
#define WARP_COUNT 6

//Workgroup size
#define HISTOGRAM256_WORKGROUP_SIZE (WARP_COUNT * WARP_SIZE)

//Local memory per workgroup
#define HISTOGRAM256_WORKGROUP_MEMORY (WARP_COUNT * HISTOGRAM256_BIN_COUNT)



////////////////////////////////////////////////////////////////////////////////
// Main computation pass: compute per-workgroup partial histograms
////////////////////////////////////////////////////////////////////////////////
#define TAG_MASK ( (1U << (UINT_BITS - LOG2_WARP_SIZE)) - 1U )

inline void addByte(volatile __local uint *l_WarpHist, uint data, uint tag){
    uint count;
    do{
        count = l_WarpHist[data] & TAG_MASK;
        count = tag | (count + 1);
        l_WarpHist[data] = count;
    }while(l_WarpHist[data] != count);
}

inline void addWord(volatile __local uint *l_WarpHist, uint data, uint tag){
    addByte(l_WarpHist, (data >>  0) & 0xFFU, tag);
    addByte(l_WarpHist, (data >>  8) & 0xFFU, tag);
    addByte(l_WarpHist, (data >> 16) & 0xFFU, tag);
    addByte(l_WarpHist, (data >> 24) & 0xFFU, tag);
}

__kernel __attribute__((reqd_work_group_size(HISTOGRAM256_WORKGROUP_SIZE, 1, 1)))
void histogram256(
    __global uint *d_PartialHistograms,
    __global uint *d_Data,
    uint dataCount
){
    //Per-warp substorage storage
    __local uint l_Hist[WARP_COUNT * HISTOGRAM256_BIN_COUNT];
    __local uint *l_WarpHist = l_Hist + (get_local_id(0) >> LOG2_WARP_SIZE) * HISTOGRAM256_BIN_COUNT;

    //Clear shared memory storage for current threadblock before processing
    for(uint i = 0; i < (HISTOGRAM256_BIN_COUNT / WARP_SIZE); i++)
        l_Hist[get_local_id(0) + i  * (WARP_COUNT * WARP_SIZE)] = 0;

    const uint tag =  get_local_id(0) << (32 - LOG2_WARP_SIZE);

    //Read through the entire input buffer, build per-warp histograms
    barrier(CLK_LOCAL_MEM_FENCE);
    for(uint pos = get_global_id(0); pos < dataCount; pos += get_global_size(0)){
        uint data = d_Data[pos];
        addWord(l_WarpHist, data, tag);
    }

    //Per-block histogram reduction
    barrier(CLK_LOCAL_MEM_FENCE);
    for(uint pos = get_local_id(0); pos < HISTOGRAM256_BIN_COUNT; pos += HISTOGRAM256_WORKGROUP_SIZE){
        uint sum = 0;

        for(uint i = 0; i < WARP_COUNT; i++)
            sum += l_Hist[pos + i * HISTOGRAM256_BIN_COUNT] & TAG_MASK;

        d_PartialHistograms[get_group_id(0) * HISTOGRAM256_BIN_COUNT + pos] = sum;
    }
}



////////////////////////////////////////////////////////////////////////////////
// Merge histogram256() output
// Run one workgroup per bin; each workgroup adds up the same bin counter 
// from every partial histogram. Reads are uncoalesced, but mergeHistogram256
// takes only a fraction of total processing time
////////////////////////////////////////////////////////////////////////////////
#define MERGE_WORKGROUP_SIZE 256

__kernel void mergeHistogram256(
    __global uint *d_Histogram,
    __global uint *d_PartialHistograms,
    uint histogramCount
){
    __local uint l_Data[MERGE_WORKGROUP_SIZE];

    uint sum = 0;
    for(uint i = get_local_id(0); i < histogramCount; i += MERGE_WORKGROUP_SIZE)
        sum += d_PartialHistograms[get_group_id(0) + i * HISTOGRAM256_BIN_COUNT];
    l_Data[get_local_id(0)] = sum;

    for(uint stride = MERGE_WORKGROUP_SIZE / 2; stride > 0; stride >>= 1){
        barrier(CLK_LOCAL_MEM_FENCE);
        if(get_local_id(0) < stride)
            l_Data[get_local_id(0)] += l_Data[get_local_id(0) + stride];
    }

    if(get_local_id(0) == 0)
        d_Histogram[get_group_id(0)] = l_Data[0];
}