CUDA shared memory and atomic operations: computing a character histogram


Preface

The code that "CUDA by Example" gives for this program is broken: the book's example adds all of the data to the first element of the histogram array and leaves every other bin at zero (at least that is what happened on my machine). So I wrote my own version, and along the way sped up the CPU-side initialization of the input data by running it on multiple threads (multithreading via the standard library's std::thread).

Here is the code:

#include<cuda_runtime.h>
#include<device_launch_parameters.h>
#include<cstdio>
#include<cstdlib>   // rand, malloc, free
#include<iostream>
#include<thread>
#include<vector>


static const int SIZE_DATA = 100 * 1024 * 1024;

// Fill a buffer with random bytes, splitting the work across all hardware threads.
unsigned char * h_getrndarray(int size)
{
	const auto cpu_maxthread = std::thread::hardware_concurrency();
	unsigned char *buffer = (unsigned char*)malloc(size * sizeof(unsigned char));
	std::vector<std::thread> thread_vec;
	// Round up so the last chunk also covers the remainder when size
	// is not evenly divisible by the thread count.
	const auto cpusize = (size + cpu_maxthread - 1) / cpu_maxthread;
	// rand() shares global state across threads; acceptable here because
	// we only need arbitrary test data, not statistical quality.
	auto h_thread_getrnd = [](unsigned char *data, int start, int size)
	{
		for (auto i = start; i < start + size; ++i)
			if (i < SIZE_DATA)
				data[i] = rand() % 256;
	};
	for (size_t i{}; i < cpu_maxthread; ++i)
		thread_vec.emplace_back(h_thread_getrnd, buffer, i * cpusize, cpusize);
	for (auto &i : thread_vec)
		i.join();
	return buffer;
}

__global__ void histo_kernel(unsigned char *data, unsigned int *d_histo, long size)
{
	// One 256-bin sub-histogram per block, kept in fast shared memory.
	__shared__ unsigned int temp[256];
	temp[threadIdx.x] = 0;
	__syncthreads();
	// Grid-stride loop: each thread handles every (blockDim.x * gridDim.x)-th element.
	auto i = threadIdx.x + blockIdx.x * blockDim.x;
	auto stride = blockDim.x * gridDim.x;
	while (i < size)
	{
		atomicAdd(&temp[data[i]], 1);
		i += stride;
	}
	__syncthreads();
	// Merge this block's sub-histogram into the global result
	// (one thread per bin, since blockDim.x == 256).
	atomicAdd(&d_histo[threadIdx.x], temp[threadIdx.x]);
}

int main()
{
	const auto buffer_sizeof = sizeof(unsigned char) * SIZE_DATA;
	const auto histo_sizeof = sizeof(unsigned int) * 256;
	unsigned char *buffer = h_getrndarray(SIZE_DATA);
	unsigned int histo[256] = {};
	unsigned char *d_buffer;
	unsigned int *d_histo;
	cudaMalloc(&d_buffer, buffer_sizeof);
	cudaMemcpy(d_buffer, buffer, buffer_sizeof, cudaMemcpyHostToDevice);
	cudaMalloc(&d_histo, histo_sizeof);
	cudaMemset(d_histo, 0, histo_sizeof); // zero the histogram bins
	cudaDeviceProp prop;
	cudaGetDeviceProperties(&prop, 0);
	// Two blocks per multiprocessor keeps every SM busy.
	auto blocks = prop.multiProcessorCount;
	histo_kernel<<<blocks * 2, 256>>>(d_buffer, d_histo, SIZE_DATA);
	cudaMemcpy(histo, d_histo, histo_sizeof, cudaMemcpyDeviceToHost);
	// Note: the loop index must be int, not char; a signed char never
	// exceeds 255, so "char i < 256" would loop forever.
	for (int i = 0; i < 256; ++i)
		std::cout << i << " : " << histo[i] << " times." << std::endl;

	// CPU version, for verification ***********
	unsigned int cpu_histo[256] = {};
	for (size_t i{}; i < SIZE_DATA; ++i)
		++cpu_histo[buffer[i]];
	for (int i = 0; i < 256; ++i)
		if (cpu_histo[i] != histo[i])
			std::cout << "mismatch at bin " << i << std::endl;
	//***********//
	cudaFree(d_buffer);
	cudaFree(d_histo);
	free(buffer);
}
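
One caveat about h_getrndarray: rand() mutates shared global state, so hammering it from many threads both limits the speedup and is formally a data race. If that bothers you, here is a minimal sketch using one generator per thread from <random>; rnd_fill is my own name and this is not from the original code:

```
#include <algorithm>
#include <random>
#include <thread>
#include <vector>

// Each thread owns its generator, so no shared state is touched during the fill.
void rnd_fill(unsigned char *data, size_t size)
{
	const size_t nthreads = std::thread::hardware_concurrency();
	const size_t chunk = (size + nthreads - 1) / nthreads; // round up to cover the tail
	std::vector<std::thread> pool;
	for (size_t t = 0; t < nthreads; ++t)
		pool.emplace_back([=]
		{
			std::mt19937 gen(std::random_device{}() + t); // per-thread seed
			std::uniform_int_distribution<int> dist(0, 255);
			for (size_t i = t * chunk; i < std::min(size, (t + 1) * chunk); ++i)
				data[i] = static_cast<unsigned char>(dist(gen));
		});
	for (auto &th : pool)
		th.join();
}
```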

Here is part of the output:

[screenshot of partial results omitted]
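
The same chapter of "CUDA by Example" also times its kernels. To measure this version, a minimal sketch using CUDA events, wrapped around the histo_kernel launch inside main (the start/stop names are my own):

```
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
histo_kernel<<<blocks * 2, 256>>>(d_buffer, d_histo, SIZE_DATA);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop); // wait for the kernel to finish
float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop); // elapsed time in milliseconds
std::cout << "kernel time: " << ms << " ms" << std::endl;
cudaEventDestroy(start);
cudaEventDestroy(stop);
```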

Below is another example of computing a histogram with CUDA shared memory:

```
#define NUM_BINS 256

__global__ void histogram(int *input, int *output, int num_bins, int data_size)
{
    // Per-block sub-histogram. A shared array needs a compile-time size,
    // so NUM_BINS must be at least num_bins.
    __shared__ int shared_hist[NUM_BINS];

    // Initialize shared memory to zero
    for (int i = threadIdx.x; i < num_bins; i += blockDim.x) {
        shared_hist[i] = 0;
    }
    __syncthreads();

    // Compute histogram in shared memory
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    while (tid < data_size) {
        atomicAdd(&shared_hist[input[tid]], 1);
        tid += blockDim.x * gridDim.x;
    }
    __syncthreads();

    // Merge shared memory histogram into global memory
    for (int i = threadIdx.x; i < num_bins; i += blockDim.x) {
        atomicAdd(&output[i], shared_hist[i]);
    }
}
```

In the host code, you copy the data to GPU memory and then launch this kernel to compute the histogram:

```
int main()
{
    // Allocate memory on host and device
    int *h_data, *d_data, *h_hist, *d_hist;
    int data_size = 1000000;
    int num_bins = 256;
    size_t data_bytes = data_size * sizeof(int);
    size_t hist_bytes = num_bins * sizeof(int);
    h_data = (int*)malloc(data_bytes);
    h_hist = (int*)malloc(hist_bytes);
    cudaMalloc(&d_data, data_bytes);
    cudaMalloc(&d_hist, hist_bytes);

    // Initialize data on host
    for (int i = 0; i < data_size; i++) {
        h_data[i] = rand() % num_bins;
    }

    // Copy data from host to device
    cudaMemcpy(d_data, h_data, data_bytes, cudaMemcpyHostToDevice);
    cudaMemset(d_hist, 0, hist_bytes);

    // Launch histogram kernel
    int block_size = 256;
    int grid_size = (data_size + block_size - 1) / block_size;
    histogram<<<grid_size, block_size>>>(d_data, d_hist, num_bins, data_size);

    // Copy histogram from device to host
    cudaMemcpy(h_hist, d_hist, hist_bytes, cudaMemcpyDeviceToHost);

    // Free memory on host and device
    free(h_data);
    free(h_hist);
    cudaFree(d_data);
    cudaFree(d_hist);
    return 0;
}
```

In this example, shared memory holds a per-block histogram. Each block first zeroes its shared memory, then every thread counts the elements it is responsible for, using atomic adds into the shared histogram. Once counting is done, the per-block results are merged into the global histogram, again with atomic operations to avoid race conditions. Note that the kernel launch computes the grid size from the data size and block size and passes both in; inside the kernel, the thread ID and block ID together determine which elements each thread processes.
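
For comparison, "CUDA by Example" first presents a version without shared memory, where every increment is an atomicAdd straight into global memory; contention on the same 256 global counters is exactly what makes it slow. A minimal sketch of that variant (histo_kernel_global is my name for it):

```
// Naive version: every increment is an atomic on global memory, so all
// blocks contend for the same 256 counters.
__global__ void histo_kernel_global(unsigned char *data, unsigned int *d_histo, long size)
{
	auto i = threadIdx.x + blockIdx.x * blockDim.x;
	auto stride = blockDim.x * gridDim.x;
	while (i < size)
	{
		atomicAdd(&d_histo[data[i]], 1);
		i += stride;
	}
}
```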