TBB加速
#include <tbb\tbb.h>
double TBB_Sobel(Mat srcImage, int kernelSize, Mat &gradX, Mat &gradY){
class GetGradient
{
public:
Mat *gx, *gy;//x和y方向梯度图
Mat src;
void operator()(const tbb::blocked_range<int> &r) const{//这个方法保证本对象可以调用迭代器的参数
int step = src.step;
int stepxy = gx->step;
uchar *psrcImage = src.data;
uchar *px = gx->data;
uchar *py = gy->data;
for (int rindex = r.begin(); rindex != r.end(); ++rindex){//!=
for (int j = 1; j < src.cols - 1; j++)
{
gx->ptr<short>(rindex)[j] = (psrcImage[((rindex - 1)*step + j + 1)] + psrcImage[(rindex*step + j + 1)] * 2 + psrcImage[((rindex + 1)*step + j + 1)] - psrcImage[((rindex - 1)*step + j - 1)] - psrcImage[(rindex*step + j - 1)] * 2 - psrcImage[((rindex + 1)*step + j - 1)]);
gy->ptr<short>(rindex)[j] = (psrcImage[(rindex - 1)*step + j - 1] + psrcImage[(rindex - 1)*step + j] * 2 + psrcImage[(rindex - 1)*step + j + 1] - psrcImage[(rindex + 1)*step + j - 1] - psrcImage[(rindex + 1)*step + j] * 2 - psrcImage[(rindex + 1)*step + j + 1]);
}
}
}
};
gradX = Mat::zeros(srcImage.size(), CV_16SC1);//定义相同尺寸的矩阵保存x方向梯度图
gradY = Mat::zeros(srcImage.size(), CV_16SC1);//定义相同尺寸的矩阵保存y方向梯度图
//遍历,跳过第一行第一列和最后一行最后一列
//----------------------------------------------------------------------------------------//
GetGradient m_GetGradient;
m_GetGradient.gx = &gradX;
m_GetGradient.gy = &gradY;
m_GetGradient.src = srcImage;
parallel_for(blocked_range<int>(1, srcImage.rows - 1), m_GetGradient, auto_partitioner());
//---------------------------------------------------------------------------------------//记录算法运行时间
//convertScaleAbs(gradX, gradX);
//convertScaleAbs(gradY, gradY);//
return 0;
}
OpenMP加速
#include "omp.h"
double OpenMP_Sobel(Mat srcImage, int kernelSize, Mat &gradX, Mat &gradY){
if (srcImage.empty() || srcImage.channels() != 1)
{
return -1;
}
if (srcImage.cols <= 3 || srcImage.rows <= 3)
{
return -1;
}
if (kernelSize != 3)
{
return -1;
}
gradX = Mat::zeros(srcImage.size(), CV_16SC1);//定义相同尺寸的矩阵保存x方向梯度图
gradY = Mat::zeros(srcImage.size(), CV_16SC1);//定义相同尺寸的矩阵保存y方向梯度图
int step = srcImage.step;
int stepxy = gradX.step;//可以快速定位像素点在Mat中的地址
uchar *psrcImage = srcImage.data;
uchar *px = gradX.data;
uchar *py = gradY.data;
//------------------------------------------------------------------------------//
//遍历,跳过第一行第一列和最后一行最后一列
#pragma omp parallel for
for (int i = 1; i < srcImage.rows - 1; i++)
{
#pragma omp parallel for
for (int j = 1; j < srcImage.cols - 1; j++)
{
gradX.ptr<short>(i)[j] = (psrcImage[((i - 1)*step + j + 1)] + psrcImage[(i*step + j + 1)] * 2 + psrcImage[((i + 1)*step + j + 1)] - psrcImage[((i - 1)*step + j - 1)] - psrcImage[(i*step + j - 1)] * 2 - psrcImage[((i + 1)*step + j - 1)]);
gradY.ptr<short>(i)[j] = (psrcImage[(i - 1)*step + j - 1] + psrcImage[(i - 1)*step + j] * 2 + psrcImage[(i - 1)*step + j + 1] - psrcImage[(i + 1)*step + j - 1] - psrcImage[(i + 1)*step + j] * 2 - psrcImage[(i + 1)*step + j + 1]);
}
}
//--------------------------------------------------------------------------------//记录算法运行时间
return 0;
}
SSE
double SEE_Sobel(Mat srcImage, int kernelSize, Mat &gradX, Mat &gradY){
if (srcImage.empty() || srcImage.channels() != 1)
{
return -1;
}
if (srcImage.cols <= 3 || srcImage.rows <= 3)
{
return -1;
}
if (kernelSize != 3)
{
return -1;
}
gradX = Mat::zeros(srcImage.size(), CV_16SC1);//定义相同尺寸的矩阵保存x方向梯度图
gradY = Mat::zeros(srcImage.size(), CV_16SC1);//定义相同尺寸的矩阵保存y方向梯度图
int step = srcImage.step;
int stepxy = gradX.step;//可以快速定位像素点在Mat中的地址
uchar *psrcImage = srcImage.data;
uchar *px = gradX.data;
uchar *py = gradY.data;
__m128i Zero = _mm_setzero_si128();//定义一个全0整形寄存器向量,用于将uchar扩充为short
//------------------------------------------------------------------------------//
//遍历,跳过第一行第一列和最后一行最后一列
int BlockSize = 8;//四列为步长遍历
int Block = (srcImage.cols - 2) / BlockSize;//循环次数
for (int i = 1; i < srcImage.rows-1 ; i++)
{
int j = 1;
unsigned char *First = psrcImage + (i-1) * step;
unsigned char *Second = psrcImage + i*step;
unsigned char *Third = psrcImage + (i + 1)*step;
for (; j < (Block-1)*BlockSize; j+=BlockSize)
{
//--------------------获得当前像素周围的八个的值,放到寄存器里-----------------------//
__m128i FirstP0 = _mm_unpacklo_epi8(_mm_loadu_si128((__m128i*)(First + j - 1)),Zero);//一个寄存器存相邻16个元素,前8个像素和0合并,目的是扩充数据类型为short
__m128i FirstP1 = _mm_unpacklo_epi8(_mm_loadu_si128((__m128i*)(First + j)),Zero);
__m128i FirstP2 = _mm_unpacklo_epi8(_mm_loadu_si128((__m128i*)(First + j + 1)),Zero);
__m128i SecondP0 = _mm_unpacklo_epi8(_mm_loadu_si128((__m128i*)(Second + j - 1)),Zero);
__m128i SecondP2 = _mm_unpacklo_epi8(_mm_loadu_si128((__m128i*)(Second + j + 1)),Zero);
__m128i ThirdP0 = _mm_unpacklo_epi8(_mm_loadu_si128((__m128i*)(Third + j - 1)),Zero);
__m128i ThirdP1 = _mm_unpacklo_epi8(_mm_loadu_si128((__m128i*)(Third + j)),Zero);
__m128i ThirdP2 = _mm_unpacklo_epi8(_mm_loadu_si128((__m128i*)(Third + j + 1)),Zero);
__m128i Gx = _mm_add_epi16(_mm_add_epi16(_mm_sub_epi16(FirstP2, FirstP0), _mm_slli_epi16(_mm_sub_epi16(SecondP2, SecondP0), 1)), _mm_sub_epi16(ThirdP2, ThirdP0));
__m128i Gy = _mm_add_epi16(_mm_add_epi16(_mm_sub_epi16(FirstP0, ThirdP0), _mm_slli_epi16(_mm_sub_epi16(FirstP1, ThirdP1), 1)), _mm_sub_epi16(FirstP2, ThirdP2));
//左移一位数值翻倍
_mm_storeu_si128((__m128i*)(px + i*stepxy + j*(stepxy / step)), Gx);
_mm_storeu_si128((__m128i*)(py + i*stepxy + j*(stepxy / step)), Gy);//保存结果
}
for (; j < srcImage.cols - 1; j++)//j=j+4
{
gradX.ptr<short>(i)[j] = (psrcImage[((i - 1)*step + j + 1)] + psrcImage[(i*step + j + 1)] * 2 + psrcImage[((i + 1)*step + j + 1)] - psrcImage[((i - 1)*step + j - 1)] - psrcImage[(i*step + j - 1)] * 2 - psrcImage[((i + 1)*step + j - 1)]);
gradY.ptr<short>(i)[j] = (psrcImage[(i - 1)*step + j - 1] + psrcImage[(i - 1)*step + j] * 2 + psrcImage[(i - 1)*step + j + 1] - psrcImage[(i + 1)*step + j - 1] - psrcImage[(i + 1)*step + j] * 2 - psrcImage[(i + 1)*step + j + 1]);
}
}
//--------------------------------------------------------------------------------//记录算法运行时间
//convertScaleAbs(gradX, gradX);
//convertScaleAbs(gradY, gradY);
//printf("无加速情况下索贝尔算子运行时间:%lf秒\n", cost_time);
return 0;
}
AVX
double AVX_Sobel(Mat srcImage, int kernelSize, Mat &gradX, Mat &gradY){
if (srcImage.empty() || srcImage.channels() != 1)
{
return -1;
}
if (srcImage.cols <= 3 || srcImage.rows <= 3)
{
return -1;
}
if (kernelSize != 3)
{
return -1;
}
//gradX = Mat::zeros(srcImage.size(), CV_8UC1);
gradX = Mat::zeros(srcImage.size(), CV_16SC1);//定义相同尺寸的矩阵保存x方向梯度图
gradY = Mat::zeros(srcImage.size(), CV_16SC1);//定义相同尺寸的矩阵保存y方向梯度图
int step = srcImage.step;
int stepxy = gradX.step;//可以快速定位像素点在Mat中的地址
uchar *psrcImage = srcImage.data;
uchar *px = gradX.data;
uchar *py = gradY.data;
__m256i Zero = _mm256_setzero_si256();//定义一个全0整形寄存器向量,用于将uchar扩充为short
//------------------------------------------------------------------------------//
//遍历,跳过第一行第一列和最后一行最后一列
int BlockSize =16;//AVX步长设置为32
int Block = (srcImage.cols - 2) / BlockSize;//循环次数
for (int i = 1; i < srcImage.rows - 1; i++)
{
int j = 1;
unsigned char *First = psrcImage + (i - 1) * step;
unsigned char *Second = psrcImage + i*step;
unsigned char *Third = psrcImage + (i + 1)*step;
for (; j < (Block)*BlockSize; j += BlockSize)
{
//--------------------获得当前像素周围的八个的值,放到寄存器里-----------------------//
//一个YMM寄存器存相邻32个元素,第0-7和16-23个元素和0合并,目的是扩充数据类型为short
__m256i FirstP0 = _mm256_unpacklo_epi8(_mm256_permute4x64_epi64(_mm256_loadu_si256((__m256i*)(First + j - 1)), 216), Zero);
__m256i FirstP1 = _mm256_unpacklo_epi8(_mm256_permute4x64_epi64(_mm256_loadu_si256((__m256i*)(First + j)), 216), Zero);
__m256i FirstP2 = _mm256_unpacklo_epi8(_mm256_permute4x64_epi64(_mm256_loadu_si256((__m256i*)(First + j + 1)), 216), Zero);
__m256i SecondP0 = _mm256_unpacklo_epi8(_mm256_permute4x64_epi64(_mm256_loadu_si256((__m256i*)(Second + j - 1)), 216), Zero);
__m256i SecondP2 = _mm256_unpacklo_epi8(_mm256_permute4x64_epi64(_mm256_loadu_si256((__m256i*)(Second + j + 1)), 216), Zero);
__m256i ThirdP0 = _mm256_unpacklo_epi8(_mm256_permute4x64_epi64(_mm256_loadu_si256((__m256i*)(Third + j - 1)), 216), Zero);
__m256i ThirdP1 = _mm256_unpacklo_epi8(_mm256_permute4x64_epi64(_mm256_loadu_si256((__m256i*)(Third + j)), 216), Zero);
__m256i ThirdP2 = _mm256_unpacklo_epi8(_mm256_permute4x64_epi64(_mm256_loadu_si256((__m256i*)(Third + j + 1)), 216), Zero);
//-------------------------------------------------------//
//------------------------计算梯度-----------------------//
__m256i Gx = _mm256_add_epi16(_mm256_add_epi16(_mm256_sub_epi16(FirstP2, FirstP0), _mm256_slli_epi16(_mm256_sub_epi16(SecondP2, SecondP0), 1)), _mm256_sub_epi16(ThirdP2, ThirdP0));
__m256i Gy = _mm256_add_epi16(_mm256_add_epi16(_mm256_sub_epi16(FirstP0, ThirdP0), _mm256_slli_epi16(_mm256_sub_epi16(FirstP1, ThirdP1), 1)), _mm256_sub_epi16(FirstP2, ThirdP2));
//-------------------------重新排列-----------------------------//
_mm256_storeu_si256((__m256i*)(px + i*stepxy + j*(stepxy / step)), Gx);
_mm256_storeu_si256((__m256i*)(py + i*stepxy + j*(stepxy / step)), Gy);//保存结果
}
for (; j < srcImage.cols - 1; j++)//j=j+4
{
gradX.ptr<short>(i)[j] = (psrcImage[((i - 1)*step + j + 1)] + psrcImage[(i*step + j + 1)] * 2 + psrcImage[((i + 1)*step + j + 1)] - psrcImage[((i - 1)*step + j - 1)] - psrcImage[(i*step + j - 1)] * 2 - psrcImage[((i + 1)*step + j - 1)]);
gradY.ptr<short>(i)[j] = (psrcImage[(i - 1)*step + j - 1] + psrcImage[(i - 1)*step + j] * 2 + psrcImage[(i - 1)*step + j + 1] - psrcImage[(i + 1)*step + j - 1] - psrcImage[(i + 1)*step + j] * 2 - psrcImage[(i + 1)*step + j + 1]);
}
}
//--------------------------------------------------------------------------------//记录算法运行时间
//printf("无加速情况下索贝尔算子运行时间:%lf秒\n", cost_time);
return 0;
}