Histogram
Histogram computation counts how many times data values fall within each of a given set of ranges (bins). The data can be a set of numbers or a set of characters, and so can the ranges. Computing histograms is a very common operation in image processing and computer vision.
For example, consider the histogram of the sentence "programming massively parallel processors".
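The figure itself is not reproduced here, but the counts are easy to recover with a short sequential C++ sketch (an illustration, not part of the original code) that uses the 26 lowercase letters as bins and skips spaces:

#include <cstdio>

int main() {
    const char *text = "programming massively parallel processors";
    unsigned int bins[26] = {0};  // one bin per lowercase letter

    for (const char *p = text; *p != '\0'; ++p)
        if (*p >= 'a' && *p <= 'z')  // skip spaces and any other characters
            ++bins[*p - 'a'];

    for (int i = 0; i < 26; ++i)
        if (bins[i] > 0)
            printf("%c: %u\n", 'a' + i, bins[i]);
    return 0;
}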
Thread Assignment and the Use of Atomic Operations
To compute a histogram in parallel, we divide the input data into chunks and assign each chunk to a thread. Each thread then updates the histogram using atomic operations.
With this approach we can compute the histogram of a data set in parallel, but it has one major drawback: atomic operations on global memory have high latency. A thread must wait for its atomic operation to complete before it can continue, which lowers GPU utilization.
One solution is to perform the atomic operations in shared memory instead, which greatly reduces their latency. Each block initializes and updates its own copy of the histogram in shared memory, and its threads use atomic operations on global memory only to merge that copy into the final histogram. This technique is called privatization.
Code
The host code initializes the input data with random values and launches the kernels that perform the histogram computation.
#include <iostream>
#include <cstdio>
#include <ctime>
#include <cmath>
#include <cuda_runtime.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include "histogram.cuh"
#include "helper_cuda.h"
#include "error.cuh"

const int FORTIME = 1;

int main(void) {
    int n, n_bins, t_x;
    n = 16384;
    n_bins = 4096;
    t_x = 1024;

    // Build the input with random bin indices and compute the reference
    // histogram on the host.
    thrust::host_vector<uint32_t> h_array(n);
    thrust::host_vector<uint32_t> h_bins(n_bins);
    srand(time(NULL));
    for (int i = 0; i < n; ++i)
        h_array[i] = rand() % n_bins;
    for (int i = 0; i < n; ++i)
        ++h_bins[h_array[i]];

    thrust::device_vector<uint32_t> array = h_array;
    thrust::device_vector<uint32_t> bins(n_bins);  // zero-initialized by thrust

    dim3 threads(t_x);
    dim3 blocks((n + threads.x - 1) / threads.x);

    cudaEvent_t start, stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));

    // Time the simple (global-memory) kernel. Raising FORTIME and enabling
    // the loop would average the time over repeated launches.
    checkCudaErrors(cudaEventRecord(start));
    //for (int i = 0; i < FORTIME; i++)
    simple_histogram<<<blocks, threads>>>(thrust::raw_pointer_cast(array.data()),
                                          thrust::raw_pointer_cast(bins.data()),
                                          n, n_bins);
    checkCudaErrors(cudaEventRecord(stop));
    checkCudaErrors(cudaEventSynchronize(stop));
    float elapsed_time;
    checkCudaErrors(cudaEventElapsedTime(&elapsed_time, start, stop));
    printf("Time = %g ms.\n", elapsed_time / FORTIME);

    bool success = true;
    for (int i = 0; i < n_bins; ++i)
        if (bins[i] != h_bins[i]) {
            std::cout << "Error at " << i << ": " << bins[i] << " != "
                      << h_bins[i] << " (Simple Histogram)" << std::endl;
            success = false;
            break;
        }
    if (success) printf("Simple Histogram: Success!\n");

    // Reset the bins, then time the shared-memory kernel; the third launch
    // parameter is the dynamic shared-memory size for the privatized bins.
    bins = thrust::device_vector<uint32_t>(n_bins, 0);
    checkCudaErrors(cudaEventRecord(start));
    //for (int i = 0; i < FORTIME; i++)
    histogram_shared<<<blocks, threads, n_bins * sizeof(uint32_t)>>>(
        thrust::raw_pointer_cast(array.data()),
        thrust::raw_pointer_cast(bins.data()),
        n, n_bins);
    checkCudaErrors(cudaEventRecord(stop));
    checkCudaErrors(cudaEventSynchronize(stop));
    checkCudaErrors(cudaEventElapsedTime(&elapsed_time, start, stop));
    printf("Time = %g ms.\n", elapsed_time / FORTIME);

    success = true;
    for (int i = 0; i < n_bins; ++i)
        if (bins[i] != h_bins[i]) {
            std::cout << "Error at " << i << ": " << bins[i] << " != "
                      << h_bins[i] << " (Histogram Shared)" << std::endl;
            success = false;
            break;
        }
    if (success) printf("Histogram Shared: Success!\n");

    return 0;
}
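Assuming the two kernels are declared in histogram.cuh and defined in a histogram.cu compiled alongside this file, and that the CUDA samples' helper_cuda.h is on the include path (the file layout is an assumption, not spelled out in the original), the program can be built and run with something like:

nvcc -O3 -arch=sm_86 main.cu histogram.cu -o histogram  # sm_86 targets the RTX 3060 used below
./histogram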
The two kernels are shown below.
__global__
void simple_histogram(unsigned int *input, unsigned int *bins,
                      unsigned int num_elements, unsigned int num_bins) {
    unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // Initialize bins.
    // Caution: __syncthreads() is block-local, so there is no grid-wide
    // barrier between this loop and the atomic additions below; one block may
    // start counting before another has finished zeroing. Clearing the bins
    // before the launch (as the host code's zero-initialized device_vector
    // does) is the safe approach.
    for (unsigned int binIdx = tid; binIdx < num_bins; binIdx += blockDim.x * gridDim.x)
        bins[binIdx] = 0;
    __syncthreads();

    // Histogram: interleaved partitioning with atomics on global memory
    for (unsigned int i = tid; i < num_elements; i += blockDim.x * gridDim.x)
        atomicAdd(&bins[input[i]], 1);
}

__global__
void histogram_shared(unsigned int *input, unsigned int *bins,
                      unsigned int num_elements, unsigned int num_bins) {
    unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // Privatized bins in dynamic shared memory
    extern __shared__ unsigned int bins_s[];

    // Initialize privatized bins
    for (unsigned int binIdx = threadIdx.x; binIdx < num_bins; binIdx += blockDim.x)
        bins_s[binIdx] = 0;
    __syncthreads();

    // Histogram: interleaved partitioning with atomics on shared memory
    for (unsigned int i = tid; i < num_elements; i += blockDim.x * gridDim.x)
        atomicAdd(&bins_s[input[i]], 1);
    __syncthreads();

    // Commit to global memory
    for (unsigned int binIdx = threadIdx.x; binIdx < num_bins; binIdx += blockDim.x)
        atomicAdd(&bins[binIdx], bins_s[binIdx]);
}
Global-Memory Histogram
The kernel for the global-memory histogram first initializes the bins (the counters along the histogram's horizontal axis) to zero. One caveat: __syncthreads() only synchronizes the threads of a single block, so there is no grid-wide barrier between this initialization and the atomic additions that follow, and one block may start counting before another has finished zeroing. In practice the bins should be cleared before the launch, which the host code's zero-initialized device_vector already does.
unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
// Initialize bins
for (unsigned int binIdx = tid; binIdx < num_bins; binIdx += blockDim.x * gridDim.x)
    bins[binIdx] = 0;
__syncthreads();
It then computes the histogram using interleaved partitioning, so that accesses to global memory are coalesced: consecutive threads access consecutive memory locations.
// Histogram
for (unsigned int i = tid; i < num_elements; i += blockDim.x * gridDim.x)
    atomicAdd(&bins[input[i]], 1);
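For contrast, a blocked (contiguous) partitioning would hand each thread its own contiguous chunk of the input. The sketch below is not part of the original code; it only illustrates why that layout is worse: in any given iteration, consecutive threads read addresses that are elems_per_thread elements apart, so a warp's loads cannot be coalesced.

__global__
void histogram_blocked(unsigned int *input, unsigned int *bins,
                       unsigned int num_elements, unsigned int num_bins) {
    unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int total_threads = blockDim.x * gridDim.x;
    // Each thread owns a contiguous chunk of the input.
    unsigned int elems_per_thread = (num_elements + total_threads - 1) / total_threads;

    for (unsigned int j = 0; j < elems_per_thread; ++j) {
        // In iteration j, thread 0 reads input[j], thread 1 reads
        // input[elems_per_thread + j], and so on: strided, uncoalesced loads.
        unsigned int i = tid * elems_per_thread + j;
        if (i < num_elements)
            atomicAdd(&bins[input[i]], 1);
    }
}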
Shared-Memory Approach
This kernel first initializes the shared-memory bins to zero. With n_bins = 4096 and 4-byte counters, the privatized histogram occupies 16 KB of shared memory per block, which fits comfortably within the 48 KB available per block by default.
unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
// Privatized bins
extern __shared__ unsigned int bins_s[];
// Initialize privatized bins
for (unsigned int binIdx = threadIdx.x; binIdx < num_bins; binIdx += blockDim.x)
    bins_s[binIdx] = 0;
__syncthreads();
It then computes a local (privatized) histogram using interleaved partitioning, again so that reads from global memory are coalesced.
// Histogram
for (unsigned int i = tid; i < num_elements; i += blockDim.x * gridDim.x)
    atomicAdd(&bins_s[input[i]], 1);
__syncthreads();
Finally, atomic operations on global memory merge each block's private histogram into the final one.
// Commit to global memory
for (unsigned int binIdx = threadIdx.x; binIdx < num_bins; binIdx += blockDim.x)
    atomicAdd(&bins[binIdx], bins_s[binIdx]);
Performance Analysis
Running times were measured with a vector of 16384 elements, 4096 bins, and 1024 threads per block, on an RTX 3060 6 GB.
Judging from the measured times, the shared-memory version does not reduce the elapsed time as one might expect. One likely reason is the extra work it introduces: the __syncthreads() barriers plus the initialization and final commit of the shared bins. Moreover, with 4096 bins and only 16384 elements, each bin receives on average just four updates, so contention on global atomics is low and there is little for privatization to win back. The global-memory version, meanwhile, benefits from the multi-level cache hierarchy of current GPGPU architectures, which effectively reduces its memory access latency.
Analysis Provided by the PMPP Project
The kernels' performance was measured on multiple GPUs using the NvBench project. The performance metrics studied are:
Memory bandwidth: the amount of data transferred per second.
Memory bandwidth utilization: the percentage of the available memory bandwidth that is used.
The project reports these metrics separately for the global-memory kernel and the shared-memory kernel (plots not reproduced here).
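As a rough sketch of what such a measurement can look like (this assumes NvBench's public API and is not the PMPP project's actual benchmark code), declaring the kernel's global-memory traffic lets NvBench derive both bandwidth and bandwidth-utilization columns:

#include <nvbench/nvbench.cuh>
#include <thrust/device_vector.h>
#include "histogram.cuh"

void simple_histogram_bench(nvbench::state &state) {
    const unsigned int n = 16384, n_bins = 4096, t_x = 1024;
    thrust::device_vector<unsigned int> input(n, 0);
    thrust::device_vector<unsigned int> bins(n_bins, 0);

    // Declared traffic: NvBench turns this into bytes/second (bandwidth) and
    // a percentage of the device's peak (utilization). Writes via atomicAdd
    // are approximated here as one write per bin.
    state.add_global_memory_reads<unsigned int>(n);
    state.add_global_memory_writes<unsigned int>(n_bins);

    state.exec([&](nvbench::launch &launch) {
        simple_histogram<<<(n + t_x - 1) / t_x, t_x, 0, launch.get_stream()>>>(
            thrust::raw_pointer_cast(input.data()),
            thrust::raw_pointer_cast(bins.data()),
            n, n_bins);
    });
}
// Link against NvBench's main target (nvbench::main), which supplies main().
NVBENCH_BENCH(simple_histogram_bench);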
References:
1. Programming Massively Parallel Processors: A Hands-on Approach, 2nd Edition
2. PMPP project