这里不做 padding 处理,所以输出图像的尺寸为:
输出尺寸 = 输入尺寸 - 卷积核尺寸 + 1(宽、高分别计算)
懒得多说
上才艺~~
恐龙抗狼
/// \brief Multi-channel 2-D convolution without padding ("valid" mode),
///        accumulated over all channels into a single output plane.
///
/// For every output position the kernel window is multiplied element-wise
/// with the input (the kernel is applied as stored, it is not flipped) and
/// the products of all channels are summed. The output plane has
/// (rows - kRow + 1) x (cols - kCol + 1) elements.
///
/// \param input    input image, CHW layout (kChannel planes of rows x cols floats)
/// \param kernel   kernel, CHW layout (kChannel planes of kRow x kCol floats)
/// \param outPtr   receives a buffer allocated here with new[] holding the
///                 single-channel HW result; the caller must release it with
///                 delete[]. The previous value of outPtr is overwritten
///                 without being freed. Set to nullptr when the kernel is
///                 larger than the image (no valid output position exists).
/// \param cols     input image width
/// \param rows     input image height
/// \param kRow     kernel height
/// \param kCol     kernel width
/// \param kChannel number of kernel (and input) channels
///
/// Output rows are distributed with "#pragma omp parallel for"; each
/// iteration writes only its own output row. The AVX/FMA fast path is used
/// when the translation unit is compiled with FMA support, otherwise the
/// scalar loop handles the whole row.
void convolution5(float *input,float *kernel,float *&outPtr,int cols,int rows,int kRow,int kCol,int kChannel){
    // Signed arithmetic: an oversized kernel yields a non-positive size here
    // instead of wrapping around to a huge unsigned value.
    const int out_rows = 1 + rows - kRow;
    const int out_cols = 1 + cols - kCol;
    if (out_rows <= 0 || out_cols <= 0) {
        outPtr = nullptr;
        return;
    }

    // 64-bit sizes/offsets so large tensors do not overflow int arithmetic.
    const int64_t out_size = static_cast<int64_t>(out_rows) * out_cols;
    const int64_t inPlane = static_cast<int64_t>(cols) * rows;
    const int64_t kPlane = static_cast<int64_t>(kRow) * kCol;

    // Value-initialisation zero-fills the accumulator buffer.
    outPtr = new float[out_size]();

#pragma omp parallel for
    for (int r = 0; r < out_rows; r++) {
        float *outBuffer = outPtr + static_cast<int64_t>(r) * out_cols;
        for (int ch = 0; ch < kChannel; ch++) {
            const float *kernelPlane = kernel + ch * kPlane;
            const float *inputPlane = input + ch * inPlane;
            for (int kr = 0; kr < kRow; kr++) {
                const float *kernelRow = kernelPlane + static_cast<int64_t>(kr) * kCol;
                const float *inputRow = inputPlane + static_cast<int64_t>(r + kr) * cols;
                for (int kc = 0; kc < kCol; kc++) {
                    const float tempCof = kernelRow[kc];
                    // Shifting by kc lines the input up with output column 0.
                    const float *src = inputRow + kc;
                    int c = 0;
#if defined(__FMA__) || (defined(_MSC_VER) && defined(__AVX2__))
                    // 8 floats per step. Unaligned load/store are required:
                    // neither src (offset by kc and by the row start) nor
                    // outBuffer (row stride out_cols, plain new[]) is
                    // guaranteed to be 32-byte aligned.
                    const int simdCols = (out_cols / 8) * 8;
                    const __m256 cof = _mm256_set1_ps(tempCof);
                    for (; c < simdCols; c += 8) {
                        const __m256 srcValue = _mm256_loadu_ps(src + c);
                        const __m256 dstValue = _mm256_loadu_ps(outBuffer + c);
                        _mm256_storeu_ps(outBuffer + c, _mm256_fmadd_ps(srcValue, cof, dstValue));
                    }
#endif
                    // Scalar tail (or the whole row when SIMD is unavailable).
                    for (; c < out_cols; c++) {
                        outBuffer[c] += src[c] * tempCof;
                    }
                }
            }
        }
    }
}
测试环境
i5-10400F 6核12线程
输入 64×512×512(通道×高×宽),卷积核大小 64×3×3
输出 1×510×510
结果:6ms
并不算是很高效
可作为参考