CUDA shared memory and atomic operations: computing a character histogram


Preface

The code that "CUDA by Example" gives for this program is broken: the book's example adds all of the data to the first element of the histogram array and leaves every other bin at zero (at least that is what happened on my machine). So I wrote my own version, and along the way sped up the CPU-side initialization of the input data by running it on multiple threads (multithreading via the standard library's std::thread).

Here is the code:

#include<cuda_runtime.h>
#include<device_launch_parameters.h>
#include<cstdio>
#include<cstdlib>   // rand, malloc, free
#include<iostream>
#include<thread>
#include<vector>


static const int SIZE_DATA = 100 * 1024 * 1024;

// Fill a buffer with random bytes, splitting the work across all hardware threads.
unsigned char * h_getrndarray(int size)
{
	const auto cpu_maxthread = std::thread::hardware_concurrency();
	unsigned char *buffer = (unsigned char*)malloc(size * sizeof(unsigned char));
	std::vector<std::thread> thread_vec;
	// Round up so the last chunk also covers the remainder when size
	// is not evenly divisible by the thread count.
	const auto cpusize = (size + cpu_maxthread - 1) / cpu_maxthread;
	// rand() shares global state across threads; acceptable here because
	// we only need arbitrary test data, not statistical quality.
	auto h_thread_getrnd = [](unsigned char *data, int start, int size)
	{
		for (auto i = start; i < start + size; ++i)
			if (i < SIZE_DATA)
				data[i] = rand() % 256;
	};
	for (size_t i{}; i < cpu_maxthread; ++i)
		thread_vec.emplace_back(h_thread_getrnd, buffer, i * cpusize, cpusize);
	for (auto &i : thread_vec)
		i.join();
	return buffer;
}

__global__ void histo_kernel(unsigned char *data, unsigned int *d_histo, long size)
{
	// One 256-bin sub-histogram per block, kept in fast shared memory.
	__shared__ unsigned int temp[256];
	temp[threadIdx.x] = 0;
	__syncthreads();
	// Grid-stride loop: each thread handles every (blockDim.x * gridDim.x)-th element.
	auto i = threadIdx.x + blockIdx.x * blockDim.x;
	auto stride = blockDim.x * gridDim.x;
	while (i < size)
	{
		atomicAdd(&temp[data[i]], 1);
		i += stride;
	}
	__syncthreads();
	// Merge this block's sub-histogram into the global result
	// (one thread per bin, since blockDim.x == 256).
	atomicAdd(&d_histo[threadIdx.x], temp[threadIdx.x]);
}

int main()
{
	const auto buffer_sizeof = sizeof(unsigned char) * SIZE_DATA;
	const auto histo_sizeof = sizeof(unsigned int) * 256;
	unsigned char *buffer = h_getrndarray(SIZE_DATA);
	unsigned int histo[256] = {};
	unsigned char *d_buffer;
	unsigned int *d_histo;
	cudaMalloc(&d_buffer, buffer_sizeof);
	cudaMemcpy(d_buffer, buffer, buffer_sizeof, cudaMemcpyHostToDevice);
	cudaMalloc(&d_histo, histo_sizeof);
	cudaMemset(d_histo, 0, histo_sizeof); // zero the histogram bins
	cudaDeviceProp prop;
	cudaGetDeviceProperties(&prop, 0);
	// Two blocks per multiprocessor keeps every SM busy.
	auto blocks = prop.multiProcessorCount;
	histo_kernel<<<blocks * 2, 256>>>(d_buffer, d_histo, SIZE_DATA);
	cudaMemcpy(histo, d_histo, histo_sizeof, cudaMemcpyDeviceToHost);
	// Note: the loop index must be int, not char; a signed char never
	// exceeds 255, so "char i < 256" would loop forever.
	for (int i = 0; i < 256; ++i)
		std::cout << i << " : " << histo[i] << " times." << std::endl;

	// CPU version, for verification ***********
	unsigned int cpu_histo[256] = {};
	for (size_t i{}; i < SIZE_DATA; ++i)
		++cpu_histo[buffer[i]];
	for (int i = 0; i < 256; ++i)
		if (cpu_histo[i] != histo[i])
			std::cout << "mismatch at bin " << i << std::endl;
	//***********//
	cudaFree(d_buffer);
	cudaFree(d_histo);
	free(buffer);
}
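
One caveat about h_getrndarray: rand() mutates shared global state, so hammering it from many threads both limits the speedup and is formally a data race. If that bothers you, here is a minimal sketch using one generator per thread from <random>; rnd_fill is my own name and this is not from the original code:

```
#include <algorithm>
#include <random>
#include <thread>
#include <vector>

// Each thread owns its generator, so no shared state is touched during the fill.
void rnd_fill(unsigned char *data, size_t size)
{
	const size_t nthreads = std::thread::hardware_concurrency();
	const size_t chunk = (size + nthreads - 1) / nthreads; // round up to cover the tail
	std::vector<std::thread> pool;
	for (size_t t = 0; t < nthreads; ++t)
		pool.emplace_back([=]
		{
			std::mt19937 gen(std::random_device{}() + t); // per-thread seed
			std::uniform_int_distribution<int> dist(0, 255);
			for (size_t i = t * chunk; i < std::min(size, (t + 1) * chunk); ++i)
				data[i] = static_cast<unsigned char>(dist(gen));
		});
	for (auto &th : pool)
		th.join();
}
```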

Here is part of the output:

[screenshot of partial results omitted]
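
The same chapter of "CUDA by Example" also times its kernels. To measure this version, a minimal sketch using CUDA events, wrapped around the histo_kernel launch inside main (the start/stop names are my own):

```
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
histo_kernel<<<blocks * 2, 256>>>(d_buffer, d_histo, SIZE_DATA);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop); // wait for the kernel to finish
float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop); // elapsed time in milliseconds
std::cout << "kernel time: " << ms << " ms" << std::endl;
cudaEventDestroy(start);
cudaEventDestroy(stop);
```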

Below is another example of computing a histogram with CUDA shared memory:

```
#define NUM_BINS 256

__global__ void histogram(int *input, int *output, int num_bins, int data_size)
{
    // Per-block sub-histogram. A shared array needs a compile-time size,
    // so NUM_BINS must be at least num_bins.
    __shared__ int shared_hist[NUM_BINS];

    // Initialize shared memory to zero
    for (int i = threadIdx.x; i < num_bins; i += blockDim.x) {
        shared_hist[i] = 0;
    }
    __syncthreads();

    // Compute histogram in shared memory
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    while (tid < data_size) {
        atomicAdd(&shared_hist[input[tid]], 1);
        tid += blockDim.x * gridDim.x;
    }
    __syncthreads();

    // Merge shared memory histogram into global memory
    for (int i = threadIdx.x; i < num_bins; i += blockDim.x) {
        atomicAdd(&output[i], shared_hist[i]);
    }
}
```

In the host code, you copy the data to GPU memory and then launch this kernel to compute the histogram:

```
int main()
{
    // Allocate memory on host and device
    int *h_data, *d_data, *h_hist, *d_hist;
    int data_size = 1000000;
    int num_bins = 256;
    size_t data_bytes = data_size * sizeof(int);
    size_t hist_bytes = num_bins * sizeof(int);
    h_data = (int*)malloc(data_bytes);
    h_hist = (int*)malloc(hist_bytes);
    cudaMalloc(&d_data, data_bytes);
    cudaMalloc(&d_hist, hist_bytes);

    // Initialize data on host
    for (int i = 0; i < data_size; i++) {
        h_data[i] = rand() % num_bins;
    }

    // Copy data from host to device
    cudaMemcpy(d_data, h_data, data_bytes, cudaMemcpyHostToDevice);
    cudaMemset(d_hist, 0, hist_bytes);

    // Launch histogram kernel
    int block_size = 256;
    int grid_size = (data_size + block_size - 1) / block_size;
    histogram<<<grid_size, block_size>>>(d_data, d_hist, num_bins, data_size);

    // Copy histogram from device to host
    cudaMemcpy(h_hist, d_hist, hist_bytes, cudaMemcpyDeviceToHost);

    // Free memory on host and device
    free(h_data);
    free(h_hist);
    cudaFree(d_data);
    cudaFree(d_hist);
    return 0;
}
```

In this example, shared memory holds a per-block histogram. Each block first zeroes its shared memory, then every thread counts the elements it is responsible for, using atomic adds into the shared histogram. Once counting is done, the per-block results are merged into the global histogram, again with atomic operations to avoid race conditions. Note that the kernel launch computes the grid size from the data size and block size and passes both in; inside the kernel, the thread ID and block ID together determine which elements each thread processes.
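
For comparison, "CUDA by Example" first presents a version without shared memory, where every increment is an atomicAdd straight into global memory; contention on the same 256 global counters is exactly what makes it slow. A minimal sketch of that variant (histo_kernel_global is my name for it):

```
// Naive version: every increment is an atomic on global memory, so all
// blocks contend for the same 256 counters.
__global__ void histo_kernel_global(unsigned char *data, unsigned int *d_histo, long size)
{
	auto i = threadIdx.x + blockIdx.x * blockDim.x;
	auto stride = blockDim.x * gridDim.x;
	while (i < size)
	{
		atomicAdd(&d_histo[data[i]], 1);
		i += stride;
	}
}
```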