根据《MEC——优化内存与速度的卷积计算》一文,用代码实现卷积加速。
初代版本代码实现:
#include <iostream>
#include <opencv2/core.hpp>
#include <opencv2/imgcodecs.hpp>
using namespace std;
using namespace cv;
// Parallel body that builds the MEC "lowered" matrix L from a padded image.
//
// For output column w, padded-image row h and kernel column k it writes
//     L(w, h * kernel_width + k) = Input(h, w + k)
// Every index r of the parallel range encodes one (w, h) pair as
//     r = w * Input.rows + h
// so the range handed to parallel_for_ is expected to be
// [0, Input.rows * number_of_output_columns).
class Parallel_getL : public ParallelLoopBody
{
public:
    // img          : destination matrix L, accessed as 8-bit single channel;
    //                it must already be allocated by the caller.
    // Input        : padded source image, accessed as 8-bit single channel.
    // kernel_width : number of kernel columns.
    Parallel_getL (Mat &img, Mat &Input, int kernel_width)
        : m_img(img), m_input(Input), m_kernel_width(kernel_width)
    {
    }

    virtual void operator ()(const Range& range) const
    {
        // The remainder of r is used as a ROW index of the padded image, so
        // the divisor has to be the image height. Using the width only
        // happens to work when the padded image is square.
        const int in_rows = m_input.rows;
        if (in_rows <= 0)
        {
            return; // nothing to copy, and avoids dividing by zero
        }

        for (int r = range.start; r < range.end; r++)
        {
            const int w = r / in_rows; // row of L == output column
            const int h = r % in_rows; // row of the padded image
            for (int k = 0; k < m_kernel_width; k++)
            {
                m_img.at<uchar>(w, h * m_kernel_width + k) = m_input.at<uchar>(h, w + k);
            }
        }
    }

    // No-op assignment: the reference members cannot be reseated, so the
    // implicitly generated operator would be unavailable.
    Parallel_getL& operator=(const Parallel_getL &) {
        return *this;
    }

private:
    Mat &m_img;          // destination L
    Mat &m_input;        // padded source image
    int m_kernel_width;  // kernel columns
};
class Parallel_filter : public ParallelLoopBody
{
public:
Parallel_filter (Mat &img, Mat &Input, Mat &Output, Mat &kernel)
: m_img(img), m_input(Input), m_output(Output), m_kernel(kernel)
{
m_kernel_width = kernel.cols; //没变一条之前
Mat2Rows(m_kernel, m_kernel);
}
virtual void operator ()(const Range& range) const
{
for (int r = range.start; r < range.end; r++) //process of for loop
{
for(int k = 0; k < m_output.cols; k++)
{
uchar sum = 0;
for(int i = 0; i < m_kernel.rows; i++)
{
sum += m_img.at<uchar>(r ,k * m_kernel_width + i) * m_kernel.at<float>(i,0);
}
m_output.at<uchar>(k, r) = sum; //列变行
}
}
}
Parallel_filter& operator=(const Parallel_filter &) {
return *this;
};
private:
Mat &m_img;
Mat &m_input;
Mat &m_output;
Mat &m_kernel;
int m_kernel_width;
private:
void Mat2Rows(Mat &img, Mat &dst)
{
float **array2D = new float *[img.rows * img.cols];
for(int i = 0; i < img.rows * img.cols; i++)
{
array2D[i] = new float[1];
}
int index = 0;
for(int i = 0; i < img.cols; i++)
{
for(int j = 0; j < img.rows; j++)
{
float val = img.at<float>(i,j);
array2D[index][0] = val;
index++;
}
}
dst = Mat::zeros(Size(1, img.rows * img.cols), CV_32FC1);
for(int i = 0; i < dst.rows; i++)
{
dst.at<float>(i,0) = array2D[i][0];
}
for(int i = 0; i < dst.rows; i++)
{
delete[]array2D[i];
}
delete []array2D;
}
};
// Convolves an 8-bit single-channel image with a float kernel using the MEC
// lowering scheme (despite the name, this is not the classic im2col layout).
//
// Input  : source image, CV_8UC1.
// Output : receives the result, CV_8UC1 with the same rows/cols as Input.
//          Released (left empty) when Input or kernel is empty.
// kernel : CV_32FC1 kernel; it is not modified by this call.
//
// Steps: zero-pad the image, build the lowered matrix L in parallel, print
// L to stdout, then multiply L by the flattened kernel in parallel.
void im2col(Mat &Input, Mat &Output, Mat &kernel)
{
    if (Input.empty() || kernel.empty())
    {
        Output.release();
        return;
    }
    CV_Assert(Input.type() == CV_8UC1 && kernel.type() == CV_32FC1);

    const int Oh = Input.rows;  // output image height
    const int Ow = Input.cols;  // output image width
    const int Kh = kernel.rows; // kernel height
    const int Kw = kernel.cols; // kernel width

    // Extend the border with zeros: vertical padding follows the kernel
    // height, horizontal padding follows the kernel width.
    Mat Input_;
    copyMakeBorder(Input, Input_, Kh / 2, Kh / 2, Kw / 2, Kw / 2, BORDER_CONSTANT, Scalar(0));
    const int ih = Input_.rows; // padded image height

    // Let Mat own both buffers so they are released automatically.
    // L has one row per output column and ih * Kw columns.
    Mat L_(Ow, ih * Kw, CV_8UC1);
    Output.create(Oh, Ow, CV_8UC1);

    Parallel_getL parallel_getL0(L_, Input_, Kw);
    parallel_for_(Range(0, ih * Ow), parallel_getL0);
    cout << "L >> " << L_ << endl;

    // Hand the filter a private copy so the caller's kernel keeps its shape
    // whatever the filter does with it.
    Mat kernel_copy = kernel.clone();
    Parallel_filter parallel_filter0(L_, Input_, Output, kernel_copy);
    // One parallel task index per row of L, i.e. per output column.
    parallel_for_(Range(0, Ow), parallel_filter0);
}
// Placeholder with an empty body: `image` is not read and `dst` is left
// unchanged.
void gaussian_filter(Mat &image, Mat &dst)
{
}
// Demo entry point: convolves a fixed 5x5 8-bit image with a fixed 3x3
// float kernel via im2col() and prints the result to stdout.
int main()
{
    Mat image = (Mat_<uchar>(5,5) << 2,2,1,1,2,
                                     2,0,1,1,0,
                                     2,0,1,2,0,
                                     1,1,1,1,1,
                                     0,0,1,0,2);
    Mat kernel = (Mat_<float>(3,3) << 1,0,0,
                                      1,1,1,
                                      1,0,-1);
    Mat output;
    im2col(image, output, kernel);
    cout << "output >> " << output << endl;
    // No waitKey(): it is declared in highgui, which this file does not
    // include, and no window is ever created for it to wait on.
    return 0;
}
代码大致过程:
①原图5*5,扩充边界(上下左右填充0)-> 7*7(I)。
②将I->L
③将n*n卷积核->(n*n,1)卷积核(K)
④将L向量和K向量相乘得到O(最终结果)
细节问题:
①代码加速:继承OpenCV的ParallelLoopBody并重写其operator(),再交给parallel_for_并行执行,实现两个功能:
其中一个是转成L,还有一个是卷积。
②将n*n卷积核转成(n*n,1)矩阵。
③注意float类型和uchar类型差异。
最后,请认真阅读前文对MEC的详细讲解,理解后再看代码,会更容易接受。