Deriche 模糊的实现、以及指令集加速方法

最新推荐文章于 2024-04-13 08:43:07 发布

大熊Ers

最新推荐文章于 2024-04-13 08:43:07 发布

阅读量796

点赞数 2

分类专栏：视觉算法文章标签：算法计算机视觉 c++

本文链接：https://blog.csdn.net/u011598727/article/details/118280374

版权

本文介绍了Deriche边缘检测算法的实现原理，提供了基于C++的OpenCV简单实现，并指出其在效率上的不足。随后讨论了通过AVX2指令集进行加速的方法，实现了约8倍的性能提升，但遇到32位浮点到8位转换在AVX512中才有直接指令的问题，通过间接方式解决。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

具体的实现原理参考：

https://en.wikipedia.org/wiki/Deriche_edge_detector （想看原文的找梯子）
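公式方法

对一维信号 $x[n]$，先做一次因果（正向）递归，再做一次反因果（反向）递归，最后把两者相加：

$$y_1[n] = a_1 x[n] + a_2 x[n-1] + b_1 y_1[n-1] + b_2 y_1[n-2]$$

$$y_2[n] = a_3 x[n+1] + a_4 x[n+2] + b_1 y_2[n+1] + b_2 y_2[n+2]$$

$$\theta[n] = c_1 \left( y_1[n] + y_2[n] \right)$$

二维图像先按行做一遍（系数 $c_1$），再对结果按列做一遍（系数 $c_2$）。

参数公式

$$k = \frac{(1 - e^{-\alpha})^2}{1 + 2\alpha e^{-\alpha} - e^{-2\alpha}}$$

$$a_1 = k,\quad a_2 = k e^{-\alpha}(\alpha - 1),\quad a_3 = k e^{-\alpha}(\alpha + 1),\quad a_4 = -k e^{-2\alpha}$$

$$b_1 = 2 e^{-\alpha},\quad b_2 = -e^{-2\alpha},\quad c_1 = c_2 = 1$$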

首先是 C++ 代码部分（基于 OpenCV 开发）

/**
 * @brief Smooths a single-channel image with the recursive Deriche (IIR) filter.
 *
 * The filter is separable: every row is filtered first (a causal left-to-right
 * recursion plus an anticausal right-to-left recursion, summed), then every
 * column of that intermediate result is filtered the same way.
 *
 * Border handling: the recursion needs two previous outputs that do not exist
 * at the start of a line. They are approximated from the first two samples
 * (each direction is assumed to contribute roughly half of the final value,
 * hence the division by 2) and these warm-up values are also written to the
 * two border pixels, so every output pixel is the sum of a causal and an
 * anticausal term.
 *
 * @param img    Input image. Must be non-empty, single-channel and at least
 *               2x2. It is converted to 32-bit float internally and is not
 *               modified unless @p outimg points at it.
 * @param outimg Destination; receives the result converted to CV_8UC1.
 *               Must not be null.
 * @param aph    Filter parameter alpha. Must be non-zero (alpha == 0 makes the
 *               normalisation factor 0/0); the coefficient formulas show that
 *               larger values decay faster, i.e. smooth less.
 */
void Deriche(cv::Mat& img, cv::Mat* outimg, double aph)
{
	CV_Assert(outimg != nullptr);
	CV_Assert(!img.empty() && img.channels() == 1);
	// The warm-up of every pass reads two samples, so each dimension needs at least two.
	CV_Assert(img.rows >= 2 && img.cols >= 2);

	// Filter coefficients. They are kept in double so that the mixed
	// float/double arithmetic below is evaluated in double and rounded to
	// float once per sample.
	const double alpha = aph;
	const double exalpha = exp(-alpha);
	const double exalpha2 = exp(-2 * alpha);
	const double kr = ((1 - exalpha) * (1 - exalpha)) / (1 + 2 * alpha * exalpha - exalpha2);
	const double a1 = kr;
	const double a2 = kr * exalpha * (alpha - 1);
	const double a3 = kr * exalpha * (alpha + 1);
	const double a4 = -kr * exalpha2;
	const double b1 = 2 * exalpha;
	const double b2 = -exalpha2;

	// Gains used only for the border warm-up (border values are halved).
	const double edgeGain = a1 + a2 + b1 + b2;
	const double feedback = b1 + b2;

	const int row = img.rows;
	const int col = img.cols;

	cv::Mat floatimg;
	img.convertTo(floatimg, CV_32FC1);
	cv::Mat midImg(img.size(), CV_32FC1); // result of the horizontal pass

	// ---- Horizontal pass: floatimg -> midImg ----
	for (int i = 0; i < row; i++)
	{
		const float* src = floatimg.ptr<float>(i);
		float* dst = midImg.ptr<float>(i);

		// Causal (left-to-right) recursion; prev2/prev1 are y[j-2] / y[j-1].
		float prev2 = static_cast<float>(src[0] * edgeGain / 2);
		float prev1 = static_cast<float>(src[1] * a1 / 2 + src[0] * a2 / 2 + prev2 * feedback);
		dst[0] = prev2;
		dst[1] = prev1;
		for (int j = 2; j < col; j++)
		{
			const float cur = static_cast<float>(src[j] * a1 + src[j - 1] * a2 + prev1 * b1 + prev2 * b2);
			dst[j] = cur;
			prev2 = prev1;
			prev1 = cur;
		}

		// Anticausal (right-to-left) recursion, accumulated onto the causal result;
		// prev2/prev1 are now y[j+2] / y[j+1].
		prev2 = static_cast<float>(src[col - 1] * edgeGain / 2);
		prev1 = static_cast<float>(src[col - 2] * a1 / 2 + src[col - 1] * a2 / 2 + prev2 * feedback);
		dst[col - 1] += prev2;
		dst[col - 2] += prev1;
		for (int j = col - 3; j >= 0; j--)
		{
			const float cur = static_cast<float>(src[j + 1] * a3 + src[j + 2] * a4 + prev1 * b1 + prev2 * b2);
			dst[j] += cur;
			prev2 = prev1;
			prev1 = cur;
		}
	}

	// ---- Vertical pass: midImg -> floatimg ----
	// floatimg is no longer needed as an input, so it is reused as the
	// destination; every element is overwritten before it is read.
	for (int j = 0; j < col; j++)
	{
		// Causal (top-to-bottom) recursion.
		float prev2 = static_cast<float>(midImg.at<float>(0, j) * edgeGain / 2);
		float prev1 = static_cast<float>(midImg.at<float>(1, j) * a1 / 2 + midImg.at<float>(0, j) * a2 / 2 + prev2 * feedback);
		floatimg.at<float>(0, j) = prev2;
		floatimg.at<float>(1, j) = prev1;
		for (int i = 2; i < row; i++)
		{
			const float cur = static_cast<float>(midImg.at<float>(i, j) * a1 + midImg.at<float>(i - 1, j) * a2 + prev1 * b1 + prev2 * b2);
			floatimg.at<float>(i, j) = cur;
			prev2 = prev1;
			prev1 = cur;
		}

		// Anticausal (bottom-to-top) recursion, accumulated onto the causal result.
		prev2 = static_cast<float>(midImg.at<float>(row - 1, j) * edgeGain / 2);
		prev1 = static_cast<float>(midImg.at<float>(row - 2, j) * a1 / 2 + midImg.at<float>(row - 1, j) * a2 / 2 + prev2 * feedback);
		floatimg.at<float>(row - 1, j) += prev2;
		floatimg.at<float>(row - 2, j) += prev1;
		for (int i = row - 3; i >= 0; i--)
		{
			const float cur = static_cast<float>(midImg.at<float>(i + 1, j) * a3 + midImg.at<float>(i + 2, j) * a4 + prev1 * b1 + prev2 * b2);
			floatimg.at<float>(i, j) += cur;
			prev2 = prev1;
			prev1 = cur;
		}
	}

	floatimg.convertTo(*outimg, CV_8UC1);
}

很简单，4次循环搞定（为了偷懒边缘部分未补齐）
其中比较关键的一点是图片边缘部分的初始值，公式原理上并未给出；
如果处理不好，在大半径模糊时会出现失真。

当然，这只是原理性的实现，效率很差。下一步需要使用指令集加速（采用 AVX2）。

void DericheSpeed(cv::Mat& img, cv::Mat* outimg, double aph)
{
   
	//判断输入地址与输出地址是否相同
	Mat Outimage;
	bool needNew = img.data == outimg->data || img.size != outimg->size || img.type() != outimg->type();
	if (needNew)
	{
   
		Outimage = Mat(img.size(