TBB、OpenMP、SSE、AVX 加速 Sobel 算子

TBB加速

#include <tbb\tbb.h>
/**
 * @brief 3x3 Sobel gradients of an 8-bit single-channel image, with image rows
 *        distributed across threads by tbb::parallel_for.
 *
 * gradX is (right column - left column) and gradY is (top row - bottom row),
 * each weighted 1,2,1. Border rows/columns are not computed and stay 0.
 *
 * @param srcImage   Input image; must be non-empty, CV_8U, single channel and
 *                   more than 3 pixels in each dimension.
 * @param kernelSize Sobel aperture; only 3 is supported.
 * @param gradX      Output x-gradient, CV_16SC1, same size as srcImage.
 * @param gradY      Output y-gradient, CV_16SC1, same size as srcImage.
 * @return 0 on success, -1 if the arguments are rejected (outputs untouched).
 */
double TBB_Sobel(Mat srcImage, int kernelSize, Mat &gradX, Mat &gradY){
	// Function object passed to tbb::parallel_for; every call handles one
	// sub-range of rows. Rows are written by exactly one call each.
	class GetGradient
	{
	public:
		Mat *gx, *gy; // x / y gradient images
		Mat src;      // input image (Mat header copy)
		void operator()(const tbb::blocked_range<int> &r) const{
			const int lastCol = src.cols - 1;
			for (int row = r.begin(); row != r.end(); ++row){
				// Row pointers honour each Mat's own stride, so ROI / padded
				// inputs work and no int byte-offset arithmetic is needed.
				const uchar *above = src.ptr<uchar>(row - 1);
				const uchar *here = src.ptr<uchar>(row);
				const uchar *below = src.ptr<uchar>(row + 1);
				short *outX = gx->ptr<short>(row);
				short *outY = gy->ptr<short>(row);
				for (int j = 1; j < lastCol; j++)
				{
					outX[j] = static_cast<short>(above[j + 1] + here[j + 1] * 2 + below[j + 1] - above[j - 1] - here[j - 1] * 2 - below[j - 1]);
					outY[j] = static_cast<short>(above[j - 1] + above[j] * 2 + above[j + 1] - below[j - 1] - below[j] * 2 - below[j + 1]);
				}
			}
		}
	};
	// Same argument checks as the OpenMP / SSE / AVX variants; the pixel loop
	// reads the data as uchar, so anything but 8-bit single channel is rejected.
	if (srcImage.empty() || srcImage.channels() != 1 || srcImage.depth() != CV_8U)
	{
		return -1;
	}
	if (srcImage.cols <= 3 || srcImage.rows <= 3)
	{
		return -1;
	}
	if (kernelSize != 3)
	{
		return -1;
	}
	gradX = Mat::zeros(srcImage.size(), CV_16SC1); // x-gradient, same size as the input
	gradY = Mat::zeros(srcImage.size(), CV_16SC1); // y-gradient, same size as the input
	GetGradient m_GetGradient;
	m_GetGradient.gx = &gradX;
	m_GetGradient.gy = &gradY;
	m_GetGradient.src = srcImage;
	// Skip the first and last row; they have no full 3x3 neighbourhood.
	tbb::parallel_for(tbb::blocked_range<int>(1, srcImage.rows - 1), m_GetGradient, tbb::auto_partitioner());
	return 0;
}

OpenMP加速

#include "omp.h"
/**
 * @brief 3x3 Sobel gradients of an 8-bit single-channel image, with the row
 *        loop parallelised by OpenMP.
 *
 * gradX is (right column - left column) and gradY is (top row - bottom row),
 * each weighted 1,2,1. Border rows/columns are not computed and stay 0.
 *
 * @param srcImage   Input image; must be non-empty, CV_8U, single channel and
 *                   more than 3 pixels in each dimension.
 * @param kernelSize Sobel aperture; only 3 is supported.
 * @param gradX      Output x-gradient, CV_16SC1, same size as srcImage.
 * @param gradY      Output y-gradient, CV_16SC1, same size as srcImage.
 * @return 0 on success, -1 if the arguments are rejected (outputs untouched).
 */
double OpenMP_Sobel(Mat srcImage, int kernelSize, Mat &gradX, Mat &gradY){
	// The pixel loop reads the data as uchar, so anything but 8-bit single
	// channel is rejected.
	if (srcImage.empty() || srcImage.channels() != 1 || srcImage.depth() != CV_8U)
	{
		return -1;
	}
	if (srcImage.cols <= 3 || srcImage.rows <= 3)
	{
		return -1;
	}
	if (kernelSize != 3)
	{
		return -1;
	}
	gradX = Mat::zeros(srcImage.size(), CV_16SC1); // x-gradient, same size as the input
	gradY = Mat::zeros(srcImage.size(), CV_16SC1); // y-gradient, same size as the input
	const int lastRow = srcImage.rows - 1;
	const int lastCol = srcImage.cols - 1;
	// Only the row loop is parallel. A second "parallel for" on the column loop
	// would open a nested parallel region for every row, which is pure overhead
	// (or oversubscription when nested parallelism is enabled).
#pragma omp parallel for
	for (int i = 1; i < lastRow; i++)
	{
		// Declared inside the loop so each thread has its own copies.
		// Row pointers honour each Mat's own stride (ROI / padded inputs).
		const uchar *above = srcImage.ptr<uchar>(i - 1);
		const uchar *here = srcImage.ptr<uchar>(i);
		const uchar *below = srcImage.ptr<uchar>(i + 1);
		short *outX = gradX.ptr<short>(i);
		short *outY = gradY.ptr<short>(i);
		for (int j = 1; j < lastCol; j++)
		{
			outX[j] = static_cast<short>(above[j + 1] + here[j + 1] * 2 + below[j + 1] - above[j - 1] - here[j - 1] * 2 - below[j - 1]);
			outY[j] = static_cast<short>(above[j - 1] + above[j] * 2 + above[j + 1] - below[j - 1] - below[j] * 2 - below[j + 1]);
		}
	}
	return 0;
}

SSE

double SEE_Sobel(Mat srcImage, int kernelSize, Mat &gradX, Mat &gradY){
	if (srcImage.empty() || srcImage.channels() != 1)
	{
		return -1;
	}
	if (srcImage.cols <= 3 || srcImage.rows <= 3)
	{
		return -1;
	}
	if (kernelSize != 3)
	{
		return -1;
	}
	gradX = Mat::zeros(srcImage.size(), CV_16SC1);//定义相同尺寸的矩阵保存x方向梯度图
	gradY = Mat::zeros(srcImage.size(), CV_16SC1);//定义相同尺寸的矩阵保存y方向梯度图
	int step = srcImage.step;
	int stepxy = gradX.step;//可以快速定位像素点在Mat中的地址
	uchar *psrcImage = srcImage.data;
	uchar *px = gradX.data;
	uchar *py = gradY.data;
	__m128i Zero = _mm_setzero_si128();//定义一个全0整形寄存器向量,用于将uchar扩充为short
	//------------------------------------------------------------------------------//
	//遍历,跳过第一行第一列和最后一行最后一列
	int BlockSize = 8;//四列为步长遍历
	int Block = (srcImage.cols - 2) / BlockSize;//循环次数
	for (int i = 1; i < srcImage.rows-1 ; i++)
	{	
		int j = 1;
		unsigned char *First = psrcImage + (i-1) * step;
		unsigned char *Second = psrcImage + i*step;
		unsigned char *Third = psrcImage + (i + 1)*step;
		for (; j < (Block-1)*BlockSize; j+=BlockSize)
		{	
			
			//--------------------获得当前像素周围的八个的值,放到寄存器里-----------------------//
			__m128i FirstP0 = _mm_unpacklo_epi8(_mm_loadu_si128((__m128i*)(First + j - 1)),Zero);//一个寄存器存相邻16个元素,前8个像素和0合并,目的是扩充数据类型为short
			__m128i FirstP1 = _mm_unpacklo_epi8(_mm_loadu_si128((__m128i*)(First + j)),Zero);
			__m128i FirstP2 = _mm_unpacklo_epi8(_mm_loadu_si128((__m128i*)(First + j + 1)),Zero);
			__m128i SecondP0 = _mm_unpacklo_epi8(_mm_loadu_si128((__m128i*)(Second + j - 1)),Zero);
			__m128i SecondP2 = _mm_unpacklo_epi8(_mm_loadu_si128((__m128i*)(Second + j + 1)),Zero);
			__m128i ThirdP0 = _mm_unpacklo_epi8(_mm_loadu_si128((__m128i*)(Third + j - 1)),Zero);
			__m128i ThirdP1 = _mm_unpacklo_epi8(_mm_loadu_si128((__m128i*)(Third + j)),Zero);
			__m128i ThirdP2 = _mm_unpacklo_epi8(_mm_loadu_si128((__m128i*)(Third + j + 1)),Zero);
			
			__m128i Gx = _mm_add_epi16(_mm_add_epi16(_mm_sub_epi16(FirstP2, FirstP0), _mm_slli_epi16(_mm_sub_epi16(SecondP2, SecondP0), 1)), _mm_sub_epi16(ThirdP2, ThirdP0));
			__m128i Gy = _mm_add_epi16(_mm_add_epi16(_mm_sub_epi16(FirstP0, ThirdP0), _mm_slli_epi16(_mm_sub_epi16(FirstP1, ThirdP1), 1)), _mm_sub_epi16(FirstP2, ThirdP2));
			//左移一位数值翻倍
			_mm_storeu_si128((__m128i*)(px + i*stepxy + j*(stepxy / step)), Gx);
			_mm_storeu_si128((__m128i*)(py + i*stepxy + j*(stepxy / step)), Gy);//保存结果	
		}
		for (; j < srcImage.cols - 1; j++)//j=j+4
		{	
			gradX.ptr<short>(i)[j] = (psrcImage[((i - 1)*step + j + 1)] + psrcImage[(i*step + j + 1)] * 2 + psrcImage[((i + 1)*step + j + 1)] - psrcImage[((i - 1)*step + j - 1)] - psrcImage[(i*step + j - 1)] * 2 - psrcImage[((i + 1)*step + j - 1)]);
			gradY.ptr<short>(i)[j] = (psrcImage[(i - 1)*step + j - 1] + psrcImage[(i - 1)*step + j] * 2 + psrcImage[(i - 1)*step + j + 1] - psrcImage[(i + 1)*step + j - 1] - psrcImage[(i + 1)*step + j] * 2 - psrcImage[(i + 1)*step + j + 1]);

		}
	}
	//--------------------------------------------------------------------------------//记录算法运行时间
	//convertScaleAbs(gradX, gradX);
	//convertScaleAbs(gradY, gradY);
	//printf("无加速情况下索贝尔算子运行时间:%lf秒\n", cost_time);
	return 0;
}

AVX

double AVX_Sobel(Mat srcImage, int kernelSize, Mat &gradX, Mat &gradY){
	if (srcImage.empty() || srcImage.channels() != 1)
	{
		return -1;
	}
	if (srcImage.cols <= 3 || srcImage.rows <= 3)
	{
		return -1;
	}
	if (kernelSize != 3)
	{
		return -1;
	}
	//gradX = Mat::zeros(srcImage.size(), CV_8UC1);
	gradX = Mat::zeros(srcImage.size(), CV_16SC1);//定义相同尺寸的矩阵保存x方向梯度图
	gradY = Mat::zeros(srcImage.size(), CV_16SC1);//定义相同尺寸的矩阵保存y方向梯度图
	
	int step = srcImage.step;
	int stepxy = gradX.step;//可以快速定位像素点在Mat中的地址
	uchar *psrcImage = srcImage.data;
	uchar *px = gradX.data;
	uchar *py = gradY.data;
	__m256i Zero = _mm256_setzero_si256();//定义一个全0整形寄存器向量,用于将uchar扩充为short
	//------------------------------------------------------------------------------//
	//遍历,跳过第一行第一列和最后一行最后一列
	int BlockSize =16;//AVX步长设置为32
	int Block = (srcImage.cols - 2) / BlockSize;//循环次数
	for (int i = 1; i < srcImage.rows - 1; i++)
	{
		int j = 1;
		unsigned char *First = psrcImage + (i - 1) * step;
		unsigned char *Second = psrcImage + i*step;
		unsigned char *Third = psrcImage + (i + 1)*step;
		for (; j < (Block)*BlockSize; j += BlockSize)
		{

			//--------------------获得当前像素周围的八个的值,放到寄存器里-----------------------//
			//一个YMM寄存器存相邻32个元素,第0-7和16-23个元素和0合并,目的是扩充数据类型为short
			__m256i FirstP0 = _mm256_unpacklo_epi8(_mm256_permute4x64_epi64(_mm256_loadu_si256((__m256i*)(First + j - 1)), 216), Zero);
			__m256i FirstP1 = _mm256_unpacklo_epi8(_mm256_permute4x64_epi64(_mm256_loadu_si256((__m256i*)(First + j)), 216), Zero);
			__m256i FirstP2 = _mm256_unpacklo_epi8(_mm256_permute4x64_epi64(_mm256_loadu_si256((__m256i*)(First + j + 1)), 216), Zero);
			__m256i SecondP0 = _mm256_unpacklo_epi8(_mm256_permute4x64_epi64(_mm256_loadu_si256((__m256i*)(Second + j - 1)), 216), Zero);
			__m256i SecondP2 = _mm256_unpacklo_epi8(_mm256_permute4x64_epi64(_mm256_loadu_si256((__m256i*)(Second + j + 1)), 216), Zero);
			__m256i ThirdP0 = _mm256_unpacklo_epi8(_mm256_permute4x64_epi64(_mm256_loadu_si256((__m256i*)(Third + j - 1)), 216), Zero);
			__m256i ThirdP1 = _mm256_unpacklo_epi8(_mm256_permute4x64_epi64(_mm256_loadu_si256((__m256i*)(Third + j)), 216), Zero);
			__m256i ThirdP2 = _mm256_unpacklo_epi8(_mm256_permute4x64_epi64(_mm256_loadu_si256((__m256i*)(Third + j + 1)), 216), Zero);
			//-------------------------------------------------------//

			//------------------------计算梯度-----------------------//
			__m256i Gx = _mm256_add_epi16(_mm256_add_epi16(_mm256_sub_epi16(FirstP2, FirstP0), _mm256_slli_epi16(_mm256_sub_epi16(SecondP2, SecondP0), 1)), _mm256_sub_epi16(ThirdP2, ThirdP0));
			__m256i Gy = _mm256_add_epi16(_mm256_add_epi16(_mm256_sub_epi16(FirstP0, ThirdP0), _mm256_slli_epi16(_mm256_sub_epi16(FirstP1, ThirdP1), 1)), _mm256_sub_epi16(FirstP2, ThirdP2));
		    //-------------------------重新排列-----------------------------//

			_mm256_storeu_si256((__m256i*)(px + i*stepxy + j*(stepxy / step)), Gx);
           	_mm256_storeu_si256((__m256i*)(py + i*stepxy + j*(stepxy / step)), Gy);//保存结果
		}

		for (; j < srcImage.cols - 1; j++)//j=j+4
		{
			gradX.ptr<short>(i)[j] = (psrcImage[((i - 1)*step + j + 1)] + psrcImage[(i*step + j + 1)] * 2 + psrcImage[((i + 1)*step + j + 1)] - psrcImage[((i - 1)*step + j - 1)] - psrcImage[(i*step + j - 1)] * 2 - psrcImage[((i + 1)*step + j - 1)]);
			gradY.ptr<short>(i)[j] = (psrcImage[(i - 1)*step + j - 1] + psrcImage[(i - 1)*step + j] * 2 + psrcImage[(i - 1)*step + j + 1] - psrcImage[(i + 1)*step + j - 1] - psrcImage[(i + 1)*step + j] * 2 - psrcImage[(i + 1)*step + j + 1]);
		}
		
	}
	//--------------------------------------------------------------------------------//记录算法运行时间
	//printf("无加速情况下索贝尔算子运行时间:%lf秒\n", cost_time);
	return 0;
}
  • 3
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
DPM(Deformable Part-based Model) Cascade是一种常用的目标检测算法,在OpenCV 3.0中已经支持。在DPM Cascade中,目标被分为多个部分,每个部分用一个分类器来检测,这些分类器被组成一个级联分类器,来提高检测速度和准确率。 TBBOpenMP是常用的多线程库,可以加速DPM Cascade的检测。在使用TBBOpenMP之前,需要先安装它们。在Linux系统中,可以通过以下命令安装: ``` sudo apt-get install libtbb-dev sudo apt-get install libomp-dev ``` 在Windows系统中,可以在Intel官网下载TBBOpenMP的安装程序进行安装。 下面是基于OpenCV 3.0的DPM Cascade检测代码,附带TBBOpenMP加速: ```python import cv2 import numpy as np # 加载级联分类器 cascade = cv2.CascadeClassifier('path/to/cascade.xml') # 加载图像 img = cv2.imread('path/to/image.jpg') # 转换为灰度图像 gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) # 检测目标 rects = cascade.detectMultiScale(gray, scaleFactor=1.3, minNeighbors=3, minSize=(30, 30), flags=cv2.CASCADE_SCALE_IMAGE) # 绘制矩形框 for (x, y, w, h) in rects: cv2.rectangle(img, (x, y), (x+w, y+h), (0, 255, 0), 2) # 显示结果 cv2.imshow('result', img) cv2.waitKey(0) cv2.destroyAllWindows() ``` 加入TBBOpenMP加速的代码如下: ```python import cv2 import numpy as np import time import threading import multiprocessing # 加载级联分类器 cascade = cv2.CascadeClassifier('path/to/cascade.xml') # 加载图像 img = cv2.imread('path/to/image.jpg') # 转换为灰度图像 gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) # 检测目标 start_time = time.time() rects = cascade.detectMultiScale(gray, scaleFactor=1.3, minNeighbors=3, minSize=(30, 30), flags=cv2.CASCADE_SCALE_IMAGE) end_time = time.time() print('串行检测时间:', end_time-start_time) # 绘制矩形框 for (x, y, w, h) in rects: cv2.rectangle(img, (x, y), (x+w, y+h), (0, 255, 0), 2) # 显示结果 cv2.imshow('result', img) cv2.waitKey(0) cv2.destroyAllWindows() # TBB并行检测 def detect_with_tbb(gray, cascade): rects = cascade.detectMultiScale(gray, scaleFactor=1.3, minNeighbors=3, minSize=(30, 30), flags=cv2.CASCADE_SCALE_IMAGE) return rects start_time = time.time() tbb_results = [] tbb = threading.Thread(target=tbb_results.append, args=(detect_with_tbb(gray, cascade),)) tbb.start() tbb.join() end_time = time.time() print('TBB检测时间:', end_time-start_time) # 绘制矩形框 for (x, y, w, h) in tbb_results[0]: cv2.rectangle(img, (x, 
y), (x+w, y+h), (0, 255, 0), 2) # 显示结果 cv2.imshow('result', img) cv2.waitKey(0) cv2.destroyAllWindows() # OpenMP并行检测 def detect_with_omp(gray, cascade): rects = cascade.detectMultiScale(gray, scaleFactor=1.3, minNeighbors=3, minSize=(30, 30), flags=cv2.CASCADE_SCALE_IMAGE) return rects start_time = time.time() omp_results = [] omp = multiprocessing.Process(target=omp_results.append, args=(detect_with_omp(gray, cascade),)) omp.start() omp.join() end_time = time.time() print('OpenMP检测时间:', end_time-start_time) # 绘制矩形框 for (x, y, w, h) in omp_results[0]: cv2.rectangle(img, (x, y), (x+w, y+h), (0, 255, 0), 2) # 显示结果 cv2.imshow('result', img) cv2.waitKey(0) cv2.destroyAllWindows() ``` 在上面的代码中,我们使用了Python的多线程库threading和多进程库multiprocessing来实现TBBOpenMP的并行化。通过比较串行检测时间和并行检测时间,可以看出TBBOpenMP都可以有效地加速DPM Cascade的检测。

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值