# 我的CUDA学习之旅5——OTSU二值算法（最大类间方差法、大津法）CUDA实现

6 篇文章 0 订阅
6 篇文章 0 订阅
6 篇文章 2 订阅

### 实现思路

1.统计图像灰度直方图hist数组
2.计算图像最大类间方差
3.根据计算出的最佳阈值进行二值化操作

### 实现环境

VS2013 + CUDA7.5 + Opencv2.4.13

### 实现代码

#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <cuda.h>
#include <device_functions.h>
#include <iostream>
#include <string.h>
#include <opencv2\opencv.hpp>
using namespace std;
using namespace cv;

/*

*/
__host__ int otsuThresh(int* hist, int imgHeight, int imgWidth)
{
float sum = 0;
for (int i = 0; i < 256; i++)
{
sum += i * hist[i];
}
float w0 = 0, u0 = 0;
float u = sum / (imgHeight * imgWidth);
float val = 0, maxval = 0;
float s = 0, n = 0;
int thresh = 0;
for (int i = 0; i < 256; i++)
{
s += hist[i] * i;
n += hist[i];
w0 = n / (imgHeight * imgWidth);
u0 = s / n;
val = (u - u0) * (u - u0) * w0 / (1 - w0);
if (val > maxval)
{
maxval = val;
thresh = i;
}
}
return thresh;
}

//灰度直方图统计
__global__ void imhistInCuda(unsigned char* dataIn, int* hist, int imgHeight, int imgWidth)
{
int xIndex = threadIdx.x + blockIdx.x * blockDim.x;
int yIndex = threadIdx.y + blockIdx.y * blockDim.y;

if (xIndex < imgWidth && yIndex < imgHeight)
{
atomicAdd(&hist[dataIn[yIndex * imgWidth + xIndex]], 1);
}
}

//计算最大类间方差CUDA改编程序
__global__ void OTSUthresh(const int* hist, float* sum, float* s, float* n, float* val, int imgHeight, int imgWidth, int* OtsuThresh)
{
if (blockIdx.x == 0)
{
}
else
{
if (index < blockIdx.x)
{
atomicAdd(&s[blockIdx.x - 1], hist[index] * index);
}
}
if (blockIdx.x > 0)
{
int index = blockIdx.x - 1;
float u = sum[0] / (imgHeight * imgWidth);
float w0 = n[index] / (imgHeight * imgWidth);
float u0 = s[index] / n[index];
if (w0 == 1)
{
val[index] = 0;
}
else
{
val[index] = (u - u0) * (u - u0) * w0 / (1 - w0);
}
}
if (threadIdx.x == 0 && blockIdx.x == 0)
{
float maxval = 0;
for (int i = 0; i < 256; i++)
{
if (val[i] > maxval)
{
maxval = val[i];
OtsuThresh[0] = i;
OtsuThresh[1] = val[i];
}
}
}
}

//阈值化
__global__ void otsuInCuda(unsigned char* dataIn, unsigned char* dataOut, int imgHeight, int imgWidth, int* hThresh)
{
int xIndex = threadIdx.x + blockIdx.x * blockDim.x;
int yIndex = threadIdx.y + blockIdx.y * blockDim.y;

if (xIndex < imgWidth && yIndex < imgHeight)
{
if (dataIn[yIndex * imgWidth + xIndex] > hThresh[0])
{
dataOut[yIndex * imgWidth + xIndex] = 255;
}
}
}

int main()
{
//传入灰度图

int imgHeight = srcImg.rows;
int imgWidth = srcImg.cols;

//opencv实现OTSU二值化
Mat dstImg1;
threshold(srcImg, dstImg1, 0, 255, THRESH_OTSU);

//CUDA改编
Mat dstImg2(imgHeight, imgWidth, CV_8UC1, Scalar(0));

//在GPU端开辟内存
unsigned char* d_in;
int* d_hist;

cudaMalloc((void**)&d_in, imgHeight * imgWidth * sizeof(unsigned char));
cudaMalloc((void**)&d_hist, 256 * sizeof(int));

//传入灰度图至GPU
cudaMemcpy(d_in, srcImg.data, imgHeight * imgWidth * sizeof(unsigned char), cudaMemcpyHostToDevice);

dim3 blocksPerGrid1((imgWidth + 32 - 1) / 32, (imgHeight + 32 - 1) / 32);

imhistInCuda << <blocksPerGrid1, threadsPerBlock1 >> >(d_in, d_hist, imgHeight, imgWidth);

float* d_sum;
float* d_s;
float* d_n;
float *d_val;
int* d_t;

cudaMalloc((void**)&d_sum, sizeof(float));
cudaMalloc((void**)&d_s, 256 * sizeof(float));
cudaMalloc((void**)&d_n, 256 * sizeof(float));
cudaMalloc((void**)&d_val, 256 * sizeof(float));
cudaMalloc((void**)&d_t, 2 * sizeof(int));

//定义最大类间方差计算并行规格，其中257为1 + 256，
//第1个block用来计算图像灰度的sum，后256个block用于计算256个灰度对应的s, n
dim3 blocksPerGrid2(257, 1);

OTSUthresh << <blocksPerGrid2, threadsPerBlock2 >> >(d_hist, d_sum, d_s, d_n, d_val, imgHeight, imgWidth, d_t);

unsigned char* d_out;

cudaMalloc((void**)&d_out, imgHeight * imgWidth * sizeof(unsigned char));

otsuInCuda << <blocksPerGrid1, threadsPerBlock1 >> >(d_in, d_out, imgHeight, imgWidth, d_t);

//输出结果图像
cudaMemcpy(dstImg2.data, d_out, imgHeight * imgWidth * sizeof(unsigned char), cudaMemcpyDeviceToHost);

调试用输出
//int th[2] = { 0, 0 };
//float n[256];
//memset(n, 0, sizeof(n));
//cudaMemcpy(th, d_t, 2 * sizeof(int), cudaMemcpyDeviceToHost);
//cudaMemcpy(n, d_n, 256 * sizeof(float), cudaMemcpyDeviceToHost);

cudaFree(d_in);
cudaFree(d_out);
cudaFree(d_hist);
cudaFree(d_sum);
cudaFree(d_s);
cudaFree(d_n);
cudaFree(d_val);
cudaFree(d_t);

imwrite("result1.jpg", dstImg1);
imwrite("result2.jpg", dstImg2);

return 0;
}

OpenCV实现结果图

CUDA实现后图像

### 关于本文以及CUDA的一些思考

1.计算量不大，GPU加深没发挥真正的作用
2.改写的过程涉及时序的部分只能采用串行，而串行的效率GPU反而低于CPU
3.算法还未优化至最佳

1.CUDA不是万能的，很多时候一些复杂的算法无法改写，尤其是涉及时序性的
2.加速的关键在于对于GPU内存的使用规划以及原算法的性能
3.有成熟的库函数能用的时候可不用CUDA，因为提速效果不是很明显（可能是我显卡渣的原因==），例如上面的OTSU算法，OpenCV实现20ms左右，CUDA实现10ms左右
4.CUDA的时间花费大部分都在GPU传至CPU上（一般占总时间50%以上），所以在进行编码的时候能不传数据出来就尽量不要传，尽量在GPU中完成所有算法的实现，争取一进一出~
5.CUDA中传变量只能通过数组的形式，所以就算你的变量数量为1，也要定义数组，并把数组的头指针传给核函数

• 2
点赞
• 8
收藏
觉得还不错? 一键收藏
• 0
评论
04-25 421
05-24 316
06-08 4609
06-05 192
01-30
10-25
07-31 9370
12-17 1103

### “相关推荐”对你有帮助么？

• 非常没帮助
• 没帮助
• 一般
• 有帮助
• 非常有帮助

1.余额是钱包充值的虚拟货币，按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载，可以购买VIP、付费专栏及课程。