https://github.com/gredx/simd-parallel-conv
https://zhuanlan.zhihu.com/p/419806079
https://www.cnblogs.com/Imageshop/p/9069650.html
https://zhuanlan.zhihu.com/p/308004749
https://zhuanlan.zhihu.com/p/83694328
SSE图像算法优化系列十八:三次卷积插值的进一步SSE优化。
基于CPU SIMD和winograd的卷积计算加速技术_
如何学习SIMD(单指令多数据流)并应用?
SSE图像算法优化系列九:灵活运用SIMD指令16倍提升Sobel边缘检测的速度(4000*3000的24位图像时间由480ms降低到30ms)。
SSE图像算法优化系列二:高斯模糊算法的全面优化过程分享(一)。
数字图像处理之高斯滤波加速优化
Opencv findcontours函数原理,以及python numpy实现
AVX256加速矩阵乘法
microsoft/ DirectXMath github SIMD
我要实现循环卷积sse,暂时没有找到比较好的写法:
优化前
void gaussianConvolution(Matrix<double>& srcIamge, Matrix<double>& desImage, Matrix<double>& kernel)
{
int kernelSize = kernel.numCols();
//卷积填充
int startOffset = -1 * int(kernelSize / 2);
for (int i = 0; i < srcIamge.numRows(); i++)
{
for (int j = 0; j < srcIamge.numCols(); j++)
{
double blurredPixel = 0.0;
for (int kx = 0; kx < kernelSize; kx++)
{
for (int ky = 0; ky < kernelSize; ky++)
{
int x = i + startOffset + kx, y = j + startOffset + ky;
GetPixelWrapAround(srcIamge, x, y);
blurredPixel += kernel.get(kx, ky)* srcIamge.get(x, y);
}
}
desImage.set(i, j, blurredPixel);
}
}
}
void GetPixelWrapAround(const Matrix<double>& image, int& x, int& y)
{
int w = image.numRows();
int h = image.numCols();
x = (x % w + w) % w;
y = (y % h + h) % h;
}
sse优化后:
void greenNoise::gaussianConvolutionSSE(Matrix<double>& srcImage, Matrix<double>& desImage, Matrix<double>& kernel)
{
int kernelSize = kernel.numCols();
int width = srcImage.numRows();
int height = srcImage.numCols();
int startOffset = -1 * static_cast<int>(kernelSize / 2);
double temp[4];
for (int i = 0; i < width; i++)
{
for (int j = 0; j < height; j++)
{
double blurredPixel = 0.0;
for (int kx = 0; kx < kernelSize; kx++)
{
int x = (i + startOffset + kx + width) % width;
for (int ky = 0; ky < kernelSize-3; ky+=4)
{
//int y = (j + startOffset + ky + height) % height;
int y0 = j + startOffset + ky + height;
int y1 = (y0 + 1)% height;
int y2 = (y0 + 2) % height;
int y3 = (y0 + 3) % height;
y0 = y0 % height;
__m256d srcValues = _mm256_set_pd(srcImage.get(x, y0), srcImage.get(x, y1), srcImage.get(x, y2), srcImage.get(x, y3));
__m256d kernelValues = _mm256_set_pd(kernel.get(kx, ky), kernel.get(kx, ky+1), kernel.get(kx, ky+2), kernel.get(kx, ky+3));
__m256d resultVec = _mm256_mul_pd(srcValues, kernelValues);
_mm256_storeu_pd(temp, resultVec);
blurredPixel += temp[0]+ temp[1] + temp[2] + temp[3] ;
}
// Process the remaining elements (if any) without SSE
for (int ky = kernelSize - kernelSize % 4; ky < kernelSize; ++ky)
{
int y = (j + startOffset + ky + height) % height;
blurredPixel += kernel.get(kx, ky) * srcImage.get(x, y);
}
}
desImage.set(i, j, blurredPixel);
}
}
}
加入多线程:
void greenNoise::parallelGaussianConvolutionSSE(Matrix<double>& srcImage, Matrix<double>& desImage, Matrix<double>& kernel)
{
int kernelSize = kernel.numCols();
int width = srcImage.numRows();
int height = srcImage.numCols();
int startOffset = -1 * static_cast<int>(kernelSize / 2);
std::vector<std::thread> threads;
//std::mutex mutex; // Mutex to control access to the result matrix
const int numThreads = std::thread::hardware_concurrency(); // Number of available threads
const int rowsPerThread = (width + numThreads - 1) / numThreads; // Rows per thread
for (int t = 0; t < numThreads; ++t)
{
threads.emplace_back([&srcImage, &desImage, &kernel, t, rowsPerThread,
kernelSize, width, height, startOffset]()
{
for (int i = t* rowsPerThread; i < std::min(width, (t +1)* rowsPerThread); i++)
{
for (int j = 0; j < height; j++)
{
double temp[4];
double blurredPixel = 0.0;
for (int kx = 0; kx < kernelSize; kx++)
{
int x = (i + startOffset + kx + width) % width;
for (int ky = 0; ky < kernelSize - 3; ky += 4)
{
//int y = (j + startOffset + ky + height) % height;
int y0 = j + startOffset + ky + height;
int y1 = (y0 + 1) % height;
int y2 = (y0 + 2) % height;
int y3 = (y0 + 3) % height;
y0 = y0 % height;
__m256d srcValues = _mm256_set_pd(srcImage.get(x, y0), srcImage.get(x, y1), srcImage.get(x, y2), srcImage.get(x, y3));
__m256d kernelValues = _mm256_set_pd(kernel.get(kx, ky), kernel.get(kx, ky + 1), kernel.get(kx, ky + 2), kernel.get(kx, ky + 3));
__m256d resultVec = _mm256_mul_pd(srcValues, kernelValues);
_mm256_storeu_pd(temp, resultVec);
blurredPixel += temp[0] + temp[1] + temp[2] + temp[3];
}
// Process the remaining elements (if any) without SSE
for (int ky = kernelSize - kernelSize % 4; ky < kernelSize; ++ky)
{
int y = (j + startOffset + ky + height) % height;
blurredPixel += kernel.get(kx, ky) * srcImage.get(x, y);
}
}
desImage.set(i, j, blurredPixel);
}
}
}
);
}
for (auto& thread : threads)
{
thread.join();
}
}