CUDA学习笔记（1）——腐蚀

最新推荐文章于 2024-07-15 14:39:02 发布

drysbml

最新推荐文章于 2024-07-15 14:39:02 发布

阅读量875

点赞数

文章标签： CUDA

本文链接：https://blog.csdn.net/drysbml/article/details/81275986

版权

前言

本篇博客是在我学习别人的博客内容上实验出来的，所以大部分并非原创内容，参考地址：秘籍传送。

在此篇博客的基础上，我将其中的程序实验，并自己阅读了一番，下面是具体内容。

主要目标

对一张已转换为灰度图的图像进行腐蚀和膨胀：通过CUDA在GPU上实现此过程，再将结果传回CPU端，保存到本地文件（*.jpg）中。

具体实现

图像腐蚀：

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include<opencv2/opencv.hpp>
#include <stdio.h>

using namespace cv;
using namespace std;

/*
 * Grayscale erosion with a rectangular structuring element.
 *
 * Each thread handles one pixel (xIndex, yIndex) of a tightly packed,
 * row-major, 8-bit single-channel image of iwidth x iheight pixels.
 *
 * d_in         input image in device memory (iwidth * iheight bytes)
 * d_out        output image in device memory (same size, must not alias d_in)
 * iwidth       image width in pixels
 * iheight      image height in pixels
 * erodeElement size of the structuring element; width/2 and height/2
 *              (integer division) are used as horizontal/vertical radii
 *
 * Launch layout: a 2D grid of 2D blocks covering at least iwidth x iheight
 * threads. Surplus threads of a rounded-up grid return without touching memory.
 *
 * Pixels whose full neighbourhood does not fit inside the image are copied
 * through unchanged; all others receive the minimum of their neighbourhood.
 */
__global__ void test(unsigned char *d_in, unsigned char *d_out, int iwidth, int iheight, Size erodeElement){
	int xIndex = threadIdx.x + blockDim.x*blockIdx.x;
	int yIndex = threadIdx.y + blockDim.y*blockIdx.y;

	// The grid is rounded up to whole blocks, so threads that fall outside
	// the image must not read or write anything.
	if (xIndex >= iwidth || yIndex >= iheight){
		return;
	}

	int ewidth = erodeElement.width/2;
	int ehight = erodeElement.height/2;

	size_t center = (size_t)yIndex * iwidth + xIndex;
	unsigned char minValue = d_in[center];

	// Only erode where the whole window lies inside the image.
	if ((xIndex >= ewidth) && (yIndex >= ehight) && (xIndex+ewidth < iwidth) && (yIndex+ehight < iheight)){
		// dy walks the rows (vertical radius), dx walks the columns (horizontal radius).
		for (int dy = -ehight; dy <= ehight; dy++){
			size_t rowStart = (size_t)(yIndex + dy) * iwidth;
			for (int dx = -ewidth; dx <= ewidth; dx++){
				unsigned char neighbour = d_in[rowStart + xIndex + dx];
				// Compare against the running minimum, not the centre pixel,
				// so the result is the true minimum of the window.
				if (neighbour < minValue){
					minValue = neighbour;
				}
			}
		}
	}

	d_out[center] = minValue;
}

// Reports a failed CUDA step on stderr (the original message plus the
// runtime's description of the error), releases both device buffers and
// returns the exit code used for CUDA failures.
static int reportCudaFailure(const char *message, cudaError_t status, unsigned char *d_in, unsigned char *d_out){
	fprintf(stderr, "%s", message);
	fprintf(stderr, "%s\n", cudaGetErrorString(status));
	cudaFree(d_in);		// cudaFree(NULL) is a harmless no-op
	cudaFree(d_out);
	return 1;
}

// Loads D:\che.jpg as grayscale, erodes it on the GPU with a 3x5 rectangular
// element, writes the result to D:\test.jpg and prints the image size.
// Returns 0 on success, -1 if the image cannot be read, 1 on any CUDA or
// image-write failure.
int main(){

	Mat imgsrc = imread("D:\\che.jpg", 0);
	if (imgsrc.data == NULL){
		printf("图片读取错误！\n");
		return -1;
	}
	// The flat cudaMemcpy below requires rows without padding.
	if (!imgsrc.isContinuous()){
		imgsrc = imgsrc.clone();
	}

	// Initialised to NULL so that the cleanup path never frees a garbage pointer.
	unsigned char *d_in = NULL;
	unsigned char *d_out = NULL;

	int width = imgsrc.cols;
	int height = imgsrc.rows;
	// Computed in size_t so large images cannot overflow an int.
	size_t bytes = (size_t)width * (size_t)height * sizeof(unsigned char);
	// Open question from the author: could the result be kept in a plain array
	// and converted to a Mat afterwards? It would be more cumbersome than
	// copying straight into Mat.data.
	cudaError_t cudaStatus;
	cudaStatus = cudaMalloc((void**)&d_in, bytes);
	if (cudaStatus != cudaSuccess){
		return reportCudaFailure("cudaMalloc failed!\n", cudaStatus, d_in, d_out);
	}

	cudaStatus = cudaMalloc((void**)&d_out, bytes);
	if (cudaStatus != cudaSuccess){
		return reportCudaFailure("cudaMalloc failed!\n", cudaStatus, d_in, d_out);
	}

	cudaStatus = cudaMemcpy(d_in, imgsrc.data, bytes, cudaMemcpyHostToDevice);
	if (cudaStatus != cudaSuccess){
		return reportCudaFailure("cudaMemcpy failed!\n", cudaStatus, d_in, d_out);
	}

	dim3 threads(32, 32);
	// Ceil-division: enough blocks to cover every pixel even when the image
	// size is not a multiple of the block size.
	dim3 blocks((width + threads.x - 1)/threads.x, (height + threads.y - 1)/threads.y);
	Size Element(3, 5);

	test<<<blocks, threads>>>(d_in, d_out, width, height, Element);

	// A kernel launch returns nothing; configuration errors are only
	// visible through cudaGetLastError().
	cudaStatus = cudaGetLastError();
	if (cudaStatus != cudaSuccess){
		return reportCudaFailure("kernel launch failed!\n", cudaStatus, d_in, d_out);
	}
	// Wait for the kernel so that execution faults are reported here rather
	// than being blamed on the following copy.
	cudaStatus = cudaDeviceSynchronize();
	if (cudaStatus != cudaSuccess){
		return reportCudaFailure("kernel execution failed!\n", cudaStatus, d_in, d_out);
	}

	Mat m(height, width, CV_8UC1, cv::Scalar::all(0));

	cudaStatus = cudaMemcpy(m.data, d_out, bytes, cudaMemcpyDeviceToHost);
	if (cudaStatus != cudaSuccess){
		return reportCudaFailure("cudaMemcpy2 failed!\n", cudaStatus, d_in, d_out);
	}

	// The device buffers are no longer needed once the result is on the host.
	cudaFree(d_in);
	cudaFree(d_out);

	if (!imwrite("D:\\test.jpg", m)){
		fprintf(stderr, "imwrite failed!\n");
		return 1;
	}

	printf("%d,%d", width, height);

	// Keep the console window open until a key is pressed.
	getchar();
	return 0;
}

代码大致解释：

代码主要分几块：首先，为存放数据的数组分配空间并初始化（主要是对传入GPU的数组初始化）；其次，调用核函数执行具体任务；最后，将执行后的结果数组（d_out）复制给内存中负责显示的Mat类型变量（m），并将其保存到本地。

具体解释：

        Mat imgsrc = imread("D:\\che.jpg", 0);
	if (imgsrc.data == NULL){
		printf("图片读取错误！\n");
		return -1;
	}
	..........

	cudaStatus = cudaMemcpy(d_in, imgsrc.data, width*height*sizeof(unsigned char), cudaMemcpyHostToDevice);
	if (cudaStatus != cudaSuccess){
		fprintf(stderr, "cudaMemcpy failed!\n");
		cudaFree(d_in);
		cudaFree(d_out);
		return 1;
	}

将图片按灰度图像读入，并为之后的GPU所需数组分配空间，其中，cudaStatus是cudaError类型，来提示错误，当然也可以加一句：printf("%s\n", cudaGetErrorString(cudaStatus));来具体输出什么错误（虽然有时候提示的错误莫名其妙），网上有关于各种错误提示的解决方法。

        dim3 threads(32, 32);
	dim3 blocks((width + threads.x - 1)/threads.x, (height + threads.y - 1)/threads.y);
	Size Element(3, 5);

	test << <blocks, threads >> >(d_in, d_out, width, height, Element);

这里是定义核函数的参数和执行核函数，这里的参数在参考的博客中有详细的解释。

特别的，对于定义的blocks，我的理解是：平常Grid所含有的block数量是：总的行数/threads（threads即每个block所含有的thread数，当然也有例外，比如参考博客的上一篇：秘籍传送2），而这里将threads固定为（32，32），则若是不同尺寸的图片加载进来，需要对blocks做适应性的调整。举个特殊的例子，若是图像为1x1的，那么总不能将blocks设置为1/32（整数除法的结果为0），而它实际上或许只需要一个线程来处理（我不太肯定，恳请各位大神给出确定答案，非常感谢！），则应该将一个block块交给它，所以blocks应该是1块（上面的定义式 (width + threads.x - 1)/threads.x 相当于向上取整，算下来的结果正是1块）。

下面是核函数：

        int ewidth = erodeElement.width/2;
	int ehight = erodeElement.height/2;

关于腐蚀，可以上网查询有关概念。这里程序的逻辑是：将一个像素点和周围的像素比较，把其中的最小值赋给这个像素点。至于“周围”有多大，是由主程序中的Size Element(3, 5);定义的，即宽3高5的矩形，通过此矩形来腐蚀图像。具体实现时，则是以当前像素点为原点，在水平和垂直方向上分别加减一半的宽度和高度来遍历周围。

if ((xIndex > ewidth) && (yIndex > ehight) && (xIndex+ewidth < iwidth) && (yIndex+ehight < iheight))

这一句是防止越界：因为要从中心点加减偏移来遍历，要是超出图像的边界，则要么结果无效、要么访问出错，上面四个关系分别对应于四周的边界。

if (d_in[xIndex + yIndex * iwidth] > d_in[(yIndex + j) * iwidth + xIndex + i]){
	d_out[xIndex + yIndex * iwidth] = d_in[(yIndex + j) * iwidth + xIndex + i];
}

这个判断则是取最小值存入d_out数组中。

最后是保存：

        Mat m(height, width, CV_8UC1, cv::Scalar::all(0));

        cudaStatus = cudaMemcpy(m.data, d_out, width*height*sizeof(unsigned char), cudaMemcpyDeviceToHost);
	if (cudaStatus != cudaSuccess){
		fprintf(stderr, "cudaMemcpy2 failed!\n");
		cudaFree(d_in);
		cudaFree(d_out);
		return 1;
	}

	imwrite("D:\\test.jpg", m);

将数据复制到内存的m变量上，将其保存到本地。