一. 背景
- 有些操作不能被拆分, 否则会引发计算错误.
- 使thread对资源有暂时的”独占性”, 避免计算错误.
二. CPU计算直方图
辅助代码见: http://blog.csdn.net/full_speed_turbo/article/details/71107132
#include "../common/book.h"
#define SIZE (100*1024*1024)
#include <ctime>
clock_t clockBegin, clockEnd;
void PrintfContainerElapseTime(char *pszContainerName, char *pszOperator, long lElapsetime)
{
printf("%s 的 %s操作 用时 %d毫秒\n", pszContainerName, pszOperator, lElapsetime);
}
int main(void)
{
clockBegin = clock();
unsigned char *buffer = (unsigned char*)big_random_block( SIZE );
unsigned int histo[256];
for (int i=0;i<256;i++)
{
histo[i] = 0;
}
for (int i=0;i<SIZE;i++)
{
histo[buffer[i]]++;
}
long histoCount = 0;
for (int i=0; i<256; i++)
{
histoCount += histo[i];
}
printf("Histogram Sum: %1d\n", histoCount);
clockEnd = clock();
//输出时间是ms
PrintfContainerElapseTime("100MB U8数据", "进行直方图", clockEnd - clockBegin);
free(buffer);
return 0;
}
三. GPU global memory 计算直方图
#include "../common/book.h"
#define SIZE (100*1024*1024)
__global__ void histo_kernel(unsigned char *buffer, long size, unsigned int *histo)
{
int i = threadIdx.x + blockIdx.x*blockDim.x;
int stride = blockDim.x * gridDim.x;
while( i<size )
{
atomicAdd( &(histo[buffer[i]]), 1 );
i += stride;
}
}
int main(void)
{
//CPU上malloc
unsigned char *buffer = (unsigned char*)big_random_block(SIZE);
//为了记录时间
cudaEvent_t start, stop;
HANDLE_ERROR( cudaEventCreate( &start ) );
HANDLE_ERROR( cudaEventCreate( &stop ) );
HANDLE_ERROR( cudaEventRecord( start, 0 ) );
// GPU上分配内存
unsigned char *dev_buffer;
unsigned int *dev_histo;
HANDLE_ERROR( cudaMalloc( (void**)&dev_buffer, SIZE ) );
HANDLE_ERROR( cudaMemcpy( dev_buffer, buffer, SIZE,
cudaMemcpyHostToDevice ) );
HANDLE_ERROR( cudaMalloc( (void**)&dev_histo, 256*sizeof(int) ) );
HANDLE_ERROR( cudaMemset( dev_histo, 0, 256*sizeof(int) ) );
//根据GPU处理器数量确定block数量
cudaDeviceProp prop;
HANDLE_ERROR( cudaGetDeviceProperties( &prop, 0 ) );
int blocks = prop.multiProcessorCount;
histo_kernel<<<blocks*2,256>>>( dev_buffer, SIZE, dev_histo );
unsigned int histo[256];
HANDLE_ERROR( cudaMemcpy(histo, dev_histo,
256*sizeof(int),
cudaMemcpyDeviceToHost ) );
//获取用时
HANDLE_ERROR( cudaEventRecord( stop, 0 ) );
HANDLE_ERROR( cudaEventSynchronize( stop ) );
float elapsedTime;
HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime, start, stop ) );
printf( "Time to generate: %3.1f ms\n", elapsedTime );
long histoCount = 0;
for ( int i=0; i<256; i++)
{
histoCount += histo[i];
}
printf( "Histogram Sum: %1d\n", histoCount );
//验证结果
for (int i=0; i<SIZE; i++)
{
histo[buffer[i]]--;
}
for (int i=0; i<256; i++)
{
if (histo[i] != 0)
{
printf("Failure at %d!\n", i);
}
}
HANDLE_ERROR( cudaEventDestroy( start ) );
HANDLE_ERROR( cudaEventDestroy( stop ) );
cudaFree( dev_buffer );
cudaFree( dev_histo );
free(buffer);
return 0;
}
如果有atomicAdd undefined
错误, VS2008按照下图设置:
四. GPU使用shared memory计算直方图
只修改kernel函数:
1. 每个block有256个thread
2. 每个thread都要先将相应共享内存temp中和threadIdx.x对应的值置0
3. 每个thread统计hist, 步长是线程总数blockDim.x * gridDim.x
4. 每个block有256个thread, 也正好有256个bin. 所以, 每个thread都将相应threadIdx.x的bin加到总的histo上.
5. 注意同步操作, 保证所有thread都计算完成, 再做下一步操作.
__global__ void histo_kernel( unsigned char *buffer,
long size,
unsigned int *histo)
{
__shared__ unsigned int temp[256];
temp[threadIdx.x] = 0;
__syncthreads();
int i = threadIdx.x + blockIdx.x * blockDim.x;
int offset = blockDim.x * gridDim.x;
while( i < size )
{
atomicAdd( &temp[buffer[i]], 1);
i += offset;
}
__syncthreads();
atomicAdd( &histo[threadIdx.x], temp[threadIdx.x] );
}