最近做了一个GPU加速直方图运算的实验,网络上有很多GPU计算直方图的例子,但是都是单通道等比较简单的GPU并行计算demo。
这次分享一个用 OpenCV 自带函数计算颜色直方图特征的例子,然后把它改写成 GPU 并行处理的版本。
直接上代码:
(1) OpenCV 自带的 HSV 空间颜色直方图计算
// Compute a 2-D hue/saturation colour histogram of a BGR image with OpenCV.
//
// The image is converted to HSV and the H and S channels are binned into a
// 25 x 25 uniform histogram (H range [0, 180), S range [0, 256)).
//
// img  : input image, interpreted as BGR by cv::cvtColor.
// feat : receives the histogram produced by cv::calcHist.
// Returns false when img is empty, true once the histogram has been written.
bool colorHist(const cv::Mat& img, cv::Mat& feat)
{
    // cv::cvtColor rejects empty input; report it through the return value instead.
    if (img.empty())
        return false;

    const int h_bins_ = 25, s_bins_ = 25;
    const int h_range_ = 180, s_range_ = 256;

    // Convert to HSV colour space; only H and S are used for the histogram.
    cv::Mat img_hsv;
    cv::cvtColor(img, img_hsv, cv::COLOR_BGR2HSV);

    // Quantize the hue and the saturation levels.
    int histSize[] = { h_bins_, s_bins_ };          // 25, 25
    float hrange[] = { 0, (float)h_range_ };        // [0, 180)
    float srange[] = { 0, (float)s_range_ };        // [0, 256)
    const float* ranges[] = { hrange, srange };

    // The histogram is computed from the 0-th (H) and 1-st (S) channels.
    int channels[] = { 0, 1 };

    // Calculate the histogram.
    cv::calcHist(&img_hsv, 1, channels, cv::Mat(),  // do not use a mask
                 feat, 2, histSize, ranges,
                 true,                              // the histogram is uniform
                 false);                            // do not accumulate into feat

    // The original fell off the end of a bool function (undefined behaviour).
    return true;
}
(2)我自己实现的GPU版直方图特征提取:
// Minimal descriptor of an 8-bit image handed to the CUDA kernel.
// (Plain `struct`: the original `typedef struct ... { } ;` had no declarator,
// so the typedef was ignored and only produced a compiler warning.)
struct CudaImg8Mat
{
    unsigned int row;    // number of image rows (height in pixels)
    unsigned int col;    // number of image columns (width in pixels)
    unsigned char* mat;  // pointer to the pixel bytes
};
// Compute the H/S colour histogram of a BGR image on the GPU.
//
// inImg : 8-bit, 3-channel image (interleaved BGR). Non-continuous Mats
//         (ROIs, padded rows) are cloned into a packed buffer first, because
//         the kernel indexes pixels with a row stride of exactly cols * 3.
// feat  : on success, replaced by an h_bins_ x s_bins_ CV_32FC1 matrix
//         (rows = hue bins, cols = saturation bins). Left untouched on failure.
// Returns false for unsupported input or when any CUDA call fails.
bool HistSimilarity::calc_feat_GPU(const cv::Mat& inImg, cv::Mat& feat)
{
    if (inImg.empty() || inImg.channels() != 3 || inImg.depth() != CV_8U)
        return false;
    if (h_bins_ <= 0 || s_bins_ <= 0)
        return false;

    // The upload below is a single flat memcpy, so the rows must be packed.
    const cv::Mat src = inImg.isContinuous() ? inImg : inImg.clone();

    const unsigned int rows = static_cast<unsigned int>(src.rows);
    const unsigned int cols = static_cast<unsigned int>(src.cols);
    // size_t arithmetic: 3 * cols * rows can exceed the range of int.
    const size_t imageBytes = static_cast<size_t>(rows) * cols * 3 * sizeof(unsigned char);
    const size_t histBytes = static_cast<size_t>(h_bins_) * s_bins_ * sizeof(float);

    // One thread per pixel; the kernel bounds-checks the ragged edge blocks.
    const int BLOCK_SIZE = 32;
    dim3 DimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 DimGrid((cols + BLOCK_SIZE - 1) / BLOCK_SIZE, (rows + BLOCK_SIZE - 1) / BLOCK_SIZE);

    CudaImg8Mat* cudaMatA = NULL;    // managed: filled on the host, read by the kernel
    unsigned char* devPixels = NULL; // device copy of the pixel data
    float* colorHist = NULL;         // device histogram, h_bins_ * s_bins_ floats
    cv::Mat hist = cv::Mat::zeros(h_bins_, s_bins_, CV_32FC1);

    bool ok = cudaMallocManaged(&cudaMatA, sizeof(CudaImg8Mat)) == cudaSuccess
           && cudaMalloc((void**)&devPixels, imageBytes) == cudaSuccess
           && cudaMalloc((void**)&colorHist, histBytes) == cudaSuccess;

    if (ok)
    {
        cudaMatA->row = rows;
        cudaMatA->col = cols;
        cudaMatA->mat = devPixels;
        ok = cudaMemcpy(devPixels, src.data, imageBytes, cudaMemcpyHostToDevice) == cudaSuccess
          && cudaMemset(colorHist, 0, histBytes) == cudaSuccess;
    }

    if (ok)
    {
        calcHSHist<<<DimGrid, DimBlock>>>(cudaMatA, colorHist, h_bins_, s_bins_);
        // cudaGetLastError catches launch-configuration errors; the blocking
        // device-to-host copy waits for the kernel and reports execution errors.
        ok = cudaGetLastError() == cudaSuccess
          && cudaMemcpy(hist.data, colorHist, histBytes, cudaMemcpyDeviceToHost) == cudaSuccess;
    }

    // Release through the local pointers (cudaFree(NULL) is a no-op), so the
    // cleanup never has to dereference the managed struct.
    cudaFree(devPixels);
    cudaFree(colorHist);
    cudaFree(cudaMatA);

    if (!ok)
        return false;

    feat = hist;
    return true;
}
对应的kernel函数实现:
kernel 里面进行通道转换和直方图统计
// Kernel: per-pixel BGR -> (H, S) conversion and 2-D histogram accumulation.
//
// Launch layout: 2-D grid of 2-D blocks, one thread per pixel
//   (x -> column, y -> row). Any block shape works; threads outside the image
//   return immediately. No shared memory is used.
// cudaMat   : image descriptor; cudaMat->mat must hold row * col pixels of
//             interleaved 8-bit B,G,R with a row stride of exactly col * 3 bytes.
// histogram : h_bins * s_bins floats laid out row-major as [hue bin][sat bin];
//             must be zeroed by the caller before the launch.
// h_bins    : number of hue bins over [0, 180).
// s_bins    : number of saturation bins over [0, 256).
__global__ void calcHSHist(CudaImg8Mat *cudaMat, float * histogram, int h_bins, int s_bins)
{
    const unsigned int width = cudaMat->col;
    const unsigned int height = cudaMat->row;
    const unsigned int col = blockDim.x * blockIdx.x + threadIdx.x;
    const unsigned int row = blockDim.y * blockIdx.y + threadIdx.y;
    if (col >= width || row >= height)
        return;

    // size_t offset: (row * width + col) * 3 can overflow int on large images.
    const size_t offset = (static_cast<size_t>(row) * width + col) * 3;
    const unsigned char* px = cudaMat->mat + offset;
    const float b = px[0];
    const float g = px[1];
    const float r = px[2];

    const float maxv = fmaxf(fmaxf(b, g), r);
    const float minv = fminf(fminf(b, g), r);
    const float delta = maxv - minv;

    // Grey pixels (delta == 0, which includes black) keep h = 0 and s = 0.
    float h = 0.0f, s = 0.0f;
    if (delta > 0.0f)
    {
        if (r == maxv)
            h = 60.0f * (g - b) / delta;
        else if (g == maxv)
            h = 60.0f * (2.0f + (b - r) / delta);
        else
            h = 60.0f * (4.0f + (r - g) / delta);
        if (h < 0.0f)
            h += 360.0f;
        s = delta / maxv;
    }

    // Quantize to 8-bit values first: H is halved into [0, 180), S is scaled
    // to [0, 255]; both are truncated, then mapped onto the bins.
    const float hscale = 0.5f;  // 180 * (1.f / 360.f)
    const float h_range = 180.0f, s_range = 256.0f;
    const unsigned char h8 = static_cast<unsigned char>(h * hscale);
    const unsigned char s8 = static_cast<unsigned char>(s * 255.0f);
    int idx0 = static_cast<int>(h8 * h_bins / h_range);
    int idx1 = static_cast<int>(s8 * s_bins / s_range);

    // Defensive clamp so a rounding surprise can never write out of bounds.
    idx0 = min(idx0, h_bins - 1);
    idx1 = min(idx1, s_bins - 1);

    // Row stride of an h_bins x s_bins table is s_bins (the original used
    // h_bins, which is only correct when both bin counts are equal).
    atomicAdd(&histogram[idx0 * s_bins + idx1], 1.0f);
}
遗留问题:尝试过用共享内存的方式来进行GPU直方图计算,但出现了两个问题:一是速度并没有变快;二是计算结果不一致,还需要继续改进。