写在前面
这个程序在《CUDA by Example》中给出的代码是有问题的:原先的例子代码会把所有的数据都加到直方图数组的第一个元素上,其他元素全为零(至少在我这里是这样)。所以我就自己写了一个,并且用 STL 的多线程优化了在 CPU 上初始化原始数据的效率。
代码在这里
#include<cuda_runtime.h>
#include<device_launch_parameters.h>
#include<cstdio>
#include<cstdlib>
#include<iostream>
#include<random>
#include<thread>
#include<vector>
// Number of input bytes used for the histogram demo: 100 * 1024 * 1024 (100 Mi elements of unsigned char).
static const int SIZE_DATA = 100 * 1024 * 1024;
/**
 * Allocates a host buffer of `size` bytes and fills it with pseudo-random
 * values, splitting the work across several host threads.
 *
 * Each worker owns a private std::mt19937 engine seeded from its worker index,
 * so no generator state is shared between threads and the result is
 * reproducible for a given (size, worker count) pair.
 *
 * @param size  Number of bytes to generate.
 * @return      Buffer obtained from malloc (release it with free), or nullptr
 *              when `size` is not positive or the allocation fails.
 */
unsigned char * h_getrndarray(int size)
{
    if (size <= 0)
        return nullptr;

    const size_t total = static_cast<size_t>(size);
    unsigned char *buffer = static_cast<unsigned char*>(malloc(total * sizeof(unsigned char)));
    if (buffer == nullptr)
        return nullptr;

    // hardware_concurrency() is allowed to return 0 when the value is unknown;
    // fall back to a single worker and never start more workers than bytes.
    size_t worker_count = std::thread::hardware_concurrency();
    if (worker_count == 0)
        worker_count = 1;
    if (worker_count > total)
        worker_count = total;

    // The first `remainder` workers take one extra byte so that the whole
    // buffer is covered even when `total` is not a multiple of worker_count.
    const size_t chunk = total / worker_count;
    const size_t remainder = total % worker_count;

    auto fill_range = [](unsigned char *data, size_t begin, size_t end, unsigned int seed)
    {
        std::mt19937 engine(seed);
        for (size_t i = begin; i < end; ++i)
            data[i] = static_cast<unsigned char>(engine() & 0xFFu);
    };

    std::vector<std::thread> thread_vec;
    thread_vec.reserve(worker_count);
    size_t begin = 0;
    for (size_t t = 0; t < worker_count; ++t)
    {
        const size_t end = begin + chunk + (t < remainder ? 1 : 0);
        thread_vec.emplace_back(fill_range, buffer, begin, end, static_cast<unsigned int>(t) + 1u);
        begin = end;
    }
    for (auto &worker : thread_vec)
        worker.join();
    return buffer;
}
/**
 * Accumulates a 256-bin histogram of the bytes in `data` into `d_histo`.
 *
 * Launch layout: 1D grid of 1D blocks; any grid and block size is accepted.
 * Shared memory: 256 * sizeof(unsigned int) bytes, statically allocated.
 *
 * Each block counts into a private shared-memory histogram and then merges
 * its non-empty bins into the global histogram with one atomic per bin.
 * Per-block counts are 32-bit, so a single block must not see more than
 * UINT_MAX occurrences of one value.
 *
 * @param data     Device pointer to `size` input bytes.
 * @param d_histo  Device pointer to 256 counters; the kernel adds to them, so
 *                 the caller must zero them before the first launch.
 * @param size     Number of input bytes; values <= 0 are treated as empty.
 */
__global__ void histo_kernel(unsigned char* data, size_t*d_histo, long size)
{
    constexpr unsigned int kBins = 256;
    // atomicAdd has no size_t overload on platforms where size_t is
    // `unsigned long`; the 64-bit overload takes `unsigned long long*`.
    static_assert(sizeof(size_t) == sizeof(unsigned long long),
                  "histo_kernel expects a 64-bit size_t");

    __shared__ unsigned int temp[kBins];
    // Strided clear so every bin is zeroed regardless of blockDim.x.
    for (unsigned int bin = threadIdx.x; bin < kBins; bin += blockDim.x)
        temp[bin] = 0;
    __syncthreads();

    // 64-bit indices so that neither the start offset nor the stride can wrap.
    const size_t total = size > 0 ? static_cast<size_t>(size) : 0;
    const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;
    for (size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
         i < total; i += stride)
    {
        atomicAdd(&temp[data[i]], 1u);
    }
    __syncthreads();

    unsigned long long *global_bins = reinterpret_cast<unsigned long long*>(d_histo);
    for (unsigned int bin = threadIdx.x; bin < kBins; bin += blockDim.x)
    {
        const unsigned int count = temp[bin];
        if (count != 0)  // skip the global atomic for bins this block never hit
            atomicAdd(&global_bins[bin], static_cast<unsigned long long>(count));
    }
}
/**
 * Demo driver: generates SIZE_DATA random bytes on the host, computes their
 * 256-bin histogram on the GPU with histo_kernel, prints it, then recomputes
 * the histogram on the CPU as an independent reference and reports whether
 * the two agree.
 *
 * @return EXIT_SUCCESS when the GPU and CPU histograms match, EXIT_FAILURE on
 *         any allocation/CUDA error or on a mismatch.
 */
int main()
{
    constexpr int kBins = 256;
    constexpr int kThreadsPerBlock = 256;

    // Abort with a readable message as soon as any CUDA call fails; otherwise
    // a sticky error makes every later call fail without explanation.
    auto check = [](cudaError_t status, const char *what)
    {
        if (status != cudaSuccess)
        {
            std::fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
            std::exit(EXIT_FAILURE);
        }
    };

    const auto buffer_sizeof = sizeof(unsigned char) * SIZE_DATA;
    const auto histo_sizeof = sizeof(size_t) * kBins;

    unsigned char *buffer = h_getrndarray(SIZE_DATA);
    if (buffer == nullptr)
    {
        std::fprintf(stderr, "failed to allocate the host input buffer\n");
        return EXIT_FAILURE;
    }

    size_t histo[kBins] = {};
    unsigned char *d_buffer = nullptr;
    size_t *d_histo = nullptr;
    check(cudaMalloc(&d_buffer, buffer_sizeof), "cudaMalloc(d_buffer)");
    check(cudaMemcpy(d_buffer, buffer, buffer_sizeof, cudaMemcpyHostToDevice), "cudaMemcpy(H2D)");
    check(cudaMalloc(&d_histo, histo_sizeof), "cudaMalloc(d_histo)");
    check(cudaMemset(d_histo, 0, histo_sizeof), "cudaMemset(d_histo)"); // the kernel only adds to the bins

    cudaDeviceProp prop;
    check(cudaGetDeviceProperties(&prop, 0), "cudaGetDeviceProperties");
    const auto blocks = prop.multiProcessorCount;
    histo_kernel<<<blocks * 2, kThreadsPerBlock>>>(d_buffer, d_histo, SIZE_DATA);
    check(cudaGetLastError(), "histo_kernel launch");
    // The blocking copy also waits for the kernel and surfaces execution errors.
    check(cudaMemcpy(histo, d_histo, histo_sizeof, cudaMemcpyDeviceToHost), "cudaMemcpy(D2H)");

    // An int index is required here: a char can never reach 256 and would be
    // printed as a character rather than as the bin number.
    for (int i = 0; i < kBins; ++i)
        std::cout << i << " : " << histo[i] << " times." << std::endl;

    // CPU reference, kept in its own array so it does not add to the GPU result.
    size_t cpu_histo[kBins] = {};
    for (size_t i{}; i < SIZE_DATA; ++i)
        ++cpu_histo[buffer[i]];
    for (auto &count : cpu_histo)
        std::cout << count << ", " << std::endl;

    bool match = true;
    for (int i = 0; i < kBins; ++i)
    {
        if (histo[i] != cpu_histo[i])
        {
            match = false;
            std::cout << "mismatch in bin " << i << ": gpu=" << histo[i]
                      << " cpu=" << cpu_histo[i] << std::endl;
        }
    }
    std::cout << (match ? "GPU and CPU histograms match." : "GPU and CPU histograms differ.") << std::endl;

    check(cudaFree(d_histo), "cudaFree(d_histo)");
    check(cudaFree(d_buffer), "cudaFree(d_buffer)");
    free(buffer);
    return match ? EXIT_SUCCESS : EXIT_FAILURE;
}
这是部分的结果: