直方图概念:给定一个包含一组元素的数据集,直方图表示每个元素的出现频率。
一、在CPU上计算直方图
#include "book.h"
#include <stdio.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <time.h>

#define SIZE (100*1024*1024)

// CPU reference: build a 256-bin histogram over SIZE random bytes,
// time the computation with clock(), and print the total bin count
// (which must equal SIZE) as a sanity check.
int main(void) {
    unsigned char *data = (unsigned char*)big_random_block(SIZE);

    // capture the start time
    clock_t begin = clock();

    // one counter per possible byte value, all starting at zero
    unsigned int bins[256] = {0};

    for (int idx = 0; idx < SIZE; idx++) {
        bins[data[idx]]++;
    }

    clock_t end = clock();
    float elapsedTime = (float)(end - begin) /
        (float)CLOCKS_PER_SEC * 1000.0f;
    printf("Time to generate: %3.1f ms\n", elapsedTime);

    // total of all bins must equal the number of input bytes
    long total = 0;
    for (int b = 0; b < 256; b++) {
        total += bins[b];
    }

    printf("Histogram Sum: %ld\n", total);

    free(data);
    return 0;
}
二、在GPU上使用全局内存原子操作计算直方图
#include <stdio.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include "book.h"
#include "gpu_anim.h"

#define SIZE (100*1024*1024)

// Histogram of `size` bytes using global-memory atomics.
// Grid-stride loop: correct for any <<<blocks, threads>>> configuration.
// Every thread contends on the same 256 global counters, so atomic
// serialization dominates the runtime of this version.
__global__ void histo_kernel(unsigned char *buffer, long size, unsigned int *histo){
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    int stride = blockDim.x * gridDim.x;
    while (i < size){
        atomicAdd(&histo[buffer[i]], 1);
        i += stride;
    }
}

int main(void){
    unsigned char *buffer = (unsigned char*)big_random_block(SIZE);

    // Create timing events. The measured interval deliberately includes
    // the host<->device transfers, not just the kernel execution.
    cudaEvent_t start, stop;
    HANDLE_ERROR(cudaEventCreate(&start));
    HANDLE_ERROR(cudaEventCreate(&stop));
    HANDLE_ERROR(cudaEventRecord(start, 0));

    // Allocate GPU memory for the input data and the 256-bin histogram.
    unsigned char *dev_buffer;
    unsigned int *dev_histo;
    HANDLE_ERROR(cudaMalloc((void**)&dev_buffer, SIZE));
    HANDLE_ERROR(cudaMemcpy(dev_buffer, buffer, SIZE, cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMalloc((void**)&dev_histo, 256 * sizeof(unsigned int)));
    HANDLE_ERROR(cudaMemset(dev_histo, 0, 256 * sizeof(unsigned int)));

    // Launch two blocks per SM, 256 threads each.
    cudaDeviceProp prop;
    HANDLE_ERROR(cudaGetDeviceProperties(&prop, 0));
    int blocks = prop.multiProcessorCount;
    histo_kernel<<<blocks * 2, 256>>>(dev_buffer, SIZE, dev_histo);
    // Kernel launches do not return an error code; check explicitly.
    HANDLE_ERROR(cudaGetLastError());

    unsigned int histo[256];
    HANDLE_ERROR(cudaMemcpy(histo, dev_histo, 256 * sizeof(unsigned int), cudaMemcpyDeviceToHost));

    // Record the stop time and report the elapsed milliseconds.
    HANDLE_ERROR(cudaEventRecord(stop, 0));
    HANDLE_ERROR(cudaEventSynchronize(stop));
    float elapsedTime;
    HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime, start, stop));
    printf("Time to generate: %3.1f ms\n", elapsedTime);

    long histoCount = 0;
    for (int i = 0; i < 256; i++){
        histoCount += histo[i];
    }
    // Fix: histoCount is long, so the specifier must be %ld
    // (the original "%1d" was a field-width-1 int specifier — UB on LP64).
    printf("Histogram Sum: %ld\n", histoCount);

    // Verify against a CPU pass: decrementing per input byte must
    // bring every bin back to exactly zero.
    for (int i = 0; i < SIZE; i++)
        histo[buffer[i]]--;
    for (int i = 0; i < 256; i++){
        if (histo[i] != 0)
            printf("Failure at %d!\n", i);
    }

    // Release the CUDA events, GPU memory and host memory.
    HANDLE_ERROR(cudaEventDestroy(start));
    HANDLE_ERROR(cudaEventDestroy(stop));
    HANDLE_ERROR(cudaFree(dev_histo));
    HANDLE_ERROR(cudaFree(dev_buffer));
    free(buffer);
    return 0;
}
在GPU上运行时间比在CPU上运行时间长,性能不理想:数千个线程同时对同一组256个全局内存计数器执行原子操作,硬件必须将这些冲突的原子操作串行化,排队开销抵消了并行带来的收益。
三、在GPU上使用共享内存原子操作计算直方图
#include <stdio.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include "book.h"
#include "gpu_anim.h"

#define SIZE (100*1024*1024)

// Histogram of `size` bytes using shared-memory atomics.
// Each block accumulates a private 256-bin partial histogram in shared
// memory (contention limited to threads of one block), then merges it
// into the global histogram with a single atomicAdd per bin per block.
// REQUIRES blockDim.x == 256: both the shared-memory initialization and
// the final merge index the 256 bins by threadIdx.x.
__global__ void histo_kernel(unsigned char *buffer, long size, unsigned int *histo){
    __shared__ unsigned int temp[256];
    temp[threadIdx.x] = 0;          // shared memory is uninitialized; zero it first
    __syncthreads();                // all bins zeroed before any thread counts

    int i = threadIdx.x + blockIdx.x * blockDim.x;
    int offset = blockDim.x * gridDim.x;
    while (i < size){
        atomicAdd(&temp[buffer[i]], 1);
        i += offset;
    }

    __syncthreads();                // all counting done before the merge
    atomicAdd(&(histo[threadIdx.x]), temp[threadIdx.x]);
}

int main(void){
    unsigned char *buffer = (unsigned char*)big_random_block(SIZE);

    // Create timing events. The measured interval deliberately includes
    // the host<->device transfers, not just the kernel execution.
    cudaEvent_t start, stop;
    HANDLE_ERROR(cudaEventCreate(&start));
    HANDLE_ERROR(cudaEventCreate(&stop));
    HANDLE_ERROR(cudaEventRecord(start, 0));

    // Allocate GPU memory for the input data and the 256-bin histogram.
    unsigned char *dev_buffer;
    unsigned int *dev_histo;
    HANDLE_ERROR(cudaMalloc((void**)&dev_buffer, SIZE));
    HANDLE_ERROR(cudaMemcpy(dev_buffer, buffer, SIZE, cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMalloc((void**)&dev_histo, 256 * sizeof(unsigned int)));
    HANDLE_ERROR(cudaMemset(dev_histo, 0, 256 * sizeof(unsigned int)));

    // Launch two blocks per SM; 256 threads per block is mandatory here
    // because the kernel maps one thread to one histogram bin when merging.
    cudaDeviceProp prop;
    HANDLE_ERROR(cudaGetDeviceProperties(&prop, 0));
    int blocks = prop.multiProcessorCount;
    histo_kernel<<<blocks * 2, 256>>>(dev_buffer, SIZE, dev_histo);
    // Kernel launches do not return an error code; check explicitly.
    HANDLE_ERROR(cudaGetLastError());

    unsigned int histo[256];
    HANDLE_ERROR(cudaMemcpy(histo, dev_histo, 256 * sizeof(unsigned int), cudaMemcpyDeviceToHost));

    // Record the stop time and report the elapsed milliseconds.
    HANDLE_ERROR(cudaEventRecord(stop, 0));
    HANDLE_ERROR(cudaEventSynchronize(stop));
    float elapsedTime;
    HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime, start, stop));
    printf("Time to generate: %3.1f ms\n", elapsedTime);

    long histoCount = 0;
    for (int i = 0; i < 256; i++){
        histoCount += histo[i];
    }
    // Fix: histoCount is long, so the specifier must be %ld
    // (the original "%1d" was a field-width-1 int specifier — UB on LP64).
    printf("Histogram Sum: %ld\n", histoCount);

    // Verify against a CPU pass: decrementing per input byte must
    // bring every bin back to exactly zero.
    for (int i = 0; i < SIZE; i++)
        histo[buffer[i]]--;
    for (int i = 0; i < 256; i++){
        if (histo[i] != 0)
            printf("Failure at %d!\n", i);
    }

    // Release the CUDA events, GPU memory and host memory.
    HANDLE_ERROR(cudaEventDestroy(start));
    HANDLE_ERROR(cudaEventDestroy(stop));
    HANDLE_ERROR(cudaFree(dev_histo));
    HANDLE_ERROR(cudaFree(dev_buffer));
    free(buffer);
    return 0;
}
运行时间缩短很多,性能提升明显:每个线程块先在共享内存中累加自己的局部直方图,原子操作的竞争被限制在单个线程块内部,最后每个线程块对全局直方图的每个计数器只做一次原子加法,大幅减少了全局内存上的原子竞争。