均值滤波广泛的运用于图像处理,可以用来去除图片噪声。我们今天主要讲解一下什么是均值滤波,以及我们如何对原始的均值滤波进行算法层面的加速优化。
一 均值滤波的分类
均值滤波我们可以细分成4类:
1 算术均值滤波器:计算滑动窗口内像素的均值。
2 几何均值滤波器:
3 谐波均值滤波器:
4 谐谐波均值滤波器:当Q等于0时就变成均值滤波器 当Q等于-1时就是谐波均值滤波器
本章我们主要介绍算术均值滤波器,介绍算术均值滤波器的实现和优化。
二 算术均值滤波器
- 均值滤波的定义及效果
算术均值滤波器的定义很简单,我们假设滑动窗口大小是3*3, 我们使用3*3的窗口在原始输入图片上进行滑动,每滑动一次就计算滑动窗口中9个像素的均值作为目标图像像素。
Fig 1 均值滤波算法示意图
均值滤波效果:我们输入一张图片,然后使用均值滤波进行滤波,得到我们的滤波后的图像。右图是我们用5*5的滑动窗口进行均值滤波后的结果,左边是我们对应的原始图像。
Fig 2 kernel 5*5 均值滤波效果对比
- 均值滤波问题
均值滤波存在两个明显的问题
<1> 某个突变像素回影响到周围像素的值
<2> 均值滤波会导致边缘信息丢失
这两个问题可以用中值滤波得到解决,但是中值滤波计算时间消耗比较大 。边缘信息会丢失的问题提出了一种改进的均值滤波器: 首先设定一个阈值, 当中心像素点的像素值与平均值差的绝对值大于阈值的时候才做滤波处理。这样既可以对噪声进行平滑,图像细节上的损失也很小。
三 均值滤波的实现及优化
本文的重点是介绍均值滤波的实现以及算法层面的优化,相应的会给出均值滤波不同优化版本的C代码实现。均值滤波的计算主要分为两个部分,一个是加法运算,一个是除法运算。我们优化的目标就是减少加法和除法运算的次数。我们是基于5*5的均值滤波为例进行算法层面的优化。
(1)方案一:按照均值滤波定义的方式来实现均值滤波,如Fig 1所示。
static void BlurU8_5x5_C(DdrPoint* pDdrPoint, BlurU8Param* pParam)
{
float sum = 0.0;
int dstChannel = pParam->dstChannel;
int dstWidth = pParam->dstWidth;
int dstHeight = pParam->dstHeight;
int dstStride = pParam->dstStride;
int srcWidth = pParam->srcWidth;
int srcHeight = pParam->srcHeight;
int srcStride = pParam->srcStride;
int wSizeWidth = pParam->wSizeWidth;
int wSizeHeight = pParam->wSizeHeight;
int kSize = wSizeWidth * wSizeHeight;
for (int k = 0; k < dstChannel; ++k)
{
int srcWidthTmp = srcWidth + (wSizeWidth / 2) * 2;
int srcHeightTmp = srcHeight + (wSizeHeight / 2) * 2;
unsigned char* pSrcTmpU8 = pDdrPoint->pSrcU8 + k * srcWidth * srcHeight;
unsigned char* pDstTmpU8 = pDdrPoint->pDstU8 + k * dstWidth * dstHeight;
unsigned char* pTmpBuffer = (unsigned char*)malloc(srcWidthTmp * srcHeightTmp);
memset(pTmpBuffer, 0, srcWidthTmp * srcHeightTmp);
unsigned char* pSrcTmpBuffer = pTmpBuffer + srcWidthTmp + (wSizeWidth / 2);
for (int i = 0; i < srcHeight; ++i)
{
memcpy((pSrcTmpBuffer + i * srcWidthTmp), (pSrcTmpU8 + i * srcWidth), srcWidth);
}
for (int r = 0; r < dstHeight; ++r)
{
for (int c = 0; c < dstWidth; ++c)
{
sum = 0.0;
// SUM
for (int i = r - wSizeHeight / 2; i <= r + wSizeHeight / 2; ++i)
{
for (int j = c - wSizeWidth / 2; j <= c + wSizeWidth / 2; ++j)
{
sum += *(pSrcTmpBuffer + i * srcWidthTmp + j);
}
}
// AVE STORE
*(pDstTmpU8 + r * dstStride + c) = (int)((sum + kSize / 2) / kSize);
}
}
free(pTmpBuffer);
}
}
这样的实现方式可以实现均值滤波的效果但是效率上是有问题的,可以发现这样实现方案,存在着大量的重复计算的过程。当滑动窗口进行滑动的时候,相邻两个窗口之间存在着大量重叠的像素,该版本的均值滤波对这部分的代码做了重复计算。针对这个问题我们推出版本2的均值滤波。
(2)方案二:我们可以发现当滑动窗口从左向右滑动的时候,每一次滑动前后两个窗口之间,像素集合只改变了左右两列元素。所以我们可以将窗口像素和的结果保存在一个变量中,每次滑动时候减去最左侧移除的像素值,加上最右侧新加入的像素值就可以。
Fig 3 方案2算法示意图
static void BlurU8_5x5_C(DdrPoint* pDdrPoint, BlurU8Param* pParam)
{
float sum = 0.0;
int dstChannel = pParam->dstChannel;
int dstWidth = pParam->dstWidth;
int dstHeight = pParam->dstHeight;
int dstStride = pParam->dstStride;
int srcWidth = pParam->srcWidth;
int srcHeight = pParam->srcHeight;
int srcStride = pParam->srcStride;
int wSizeWidth = pParam->wSizeWidth;
int wSizeHeight = pParam->wSizeHeight;
int kSize = wSizeWidth * wSizeHeight;
for (int k = 0; k < dstChannel; ++k)
{
int srcWidthTmp = srcWidth + (wSizeWidth / 2) * 2;
int srcHeightTmp = srcHeight + (wSizeHeight / 2) * 2;
unsigned char* pSrcTmpU8 = pDdrPoint->pSrcU8 + k * srcWidth * srcHeight;
unsigned char* pDstTmpU8 = pDdrPoint->pDstU8 + k * dstWidth * dstHeight;
unsigned char* pTmpBuffer = (unsigned char*)malloc(srcWidthTmp * srcHeightTmp);
memset(pTmpBuffer, 0, srcWidthTmp * srcHeightTmp);
unsigned char* pSrcTmpBuffer = pTmpBuffer + (wSizeHeight / 2) * srcWidthTmp + (wSizeWidth / 2);
for (int i = 0; i < srcHeight; ++i)
{
memcpy((pSrcTmpBuffer + i * srcWidthTmp), (pSrcTmpU8 + i * srcWidth), srcWidth);
}
for (int r = 0; r < dstHeight; ++r)
{
sum = 0.0;
sum += *(pSrcTmpBuffer + r * srcWidthTmp - 2*srcWidthTmp - 2);
sum += *(pSrcTmpBuffer + r * srcWidthTmp - 2*srcWidthTmp - 1);
sum += *(pSrcTmpBuffer + r * srcWidthTmp - 2*srcWidthTmp);
sum += *(pSrcTmpBuffer + r * srcWidthTmp - 2*srcWidthTmp + 1);
sum += *(pSrcTmpBuffer + r * srcWidthTmp - 2*srcWidthTmp + 2);
sum += *(pSrcTmpBuffer + r * srcWidthTmp - srcWidthTmp - 2);
sum += *(pSrcTmpBuffer + r * srcWidthTmp - srcWidthTmp - 1);
sum += *(pSrcTmpBuffer + r * srcWidthTmp - srcWidthTmp);
sum += *(pSrcTmpBuffer + r * srcWidthTmp - srcWidthTmp + 1);
sum += *(pSrcTmpBuffer + r * srcWidthTmp - srcWidthTmp + 2);
sum += *(pSrcTmpBuffer + r * srcWidthTmp - 2);
sum += *(pSrcTmpBuffer + r * srcWidthTmp - 1);
sum += *(pSrcTmpBuffer + r * srcWidthTmp);
sum += *(pSrcTmpBuffer + r * srcWidthTmp + 1);
sum += *(pSrcTmpBuffer + r * srcWidthTmp + 2);
sum += *(pSrcTmpBuffer + r * srcWidthTmp + srcWidthTmp - 2);
sum += *(pSrcTmpBuffer + r * srcWidthTmp + srcWidthTmp - 1);
sum += *(pSrcTmpBuffer + r * srcWidthTmp + srcWidthTmp);
sum += *(pSrcTmpBuffer + r * srcWidthTmp + srcWidthTmp + 1);
sum += *(pSrcTmpBuffer + r * srcWidthTmp + srcWidthTmp + 2);
sum += *(pSrcTmpBuffer + r * srcWidthTmp + 2*srcWidthTmp - 2);
sum += *(pSrcTmpBuffer + r * srcWidthTmp + 2*srcWidthTmp - 1);
sum += *(pSrcTmpBuffer + r * srcWidthTmp + 2*srcWidthTmp);
sum += *(pSrcTmpBuffer + r * srcWidthTmp + 2*srcWidthTmp + 1);
sum += *(pSrcTmpBuffer + r * srcWidthTmp + 2*srcWidthTmp + 2);
*(pDstTmpU8 + r * dstStride) = (int)((sum + kSize / 2) / kSize);
for (int c = 1; c < dstWidth; ++c)
{
// SUM
sum -= *(pSrcTmpBuffer + r * srcWidthTmp + c - 2);
sum -= *(pSrcTmpBuffer + r * srcWidthTmp + c - srcWidthTmp - 2);
sum -= *(pSrcTmpBuffer + r * srcWidthTmp + c + srcWidthTmp - 2);
sum -= *(pSrcTmpBuffer + r * srcWidthTmp + c - 2*srcWidthTmp - 2);
sum -= *(pSrcTmpBuffer + r * srcWidthTmp + c + 2*srcWidthTmp - 2);
sum += *(pSrcTmpBuffer + r * srcWidthTmp + c + 1);
sum += *(pSrcTmpBuffer + r * srcWidthTmp + c - srcWidthTmp + 1);
sum += *(pSrcTmpBuffer + r * srcWidthTmp + c + srcWidthTmp + 1);
sum += *(pSrcTmpBuffer + r * srcWidthTmp + c - 2*srcWidthTmp + 1);
sum += *(pSrcTmpBuffer + r * srcWidthTmp + c + 2*srcWidthTmp + 1);
// AVE STORE
*(pDstTmpU8 + r * dstStride + c) = (int)((sum + kSize / 2) / kSize);
}
}
free(pTmpBuffer);
}
}
(3) 方案3:经过方案2的优化之后,我们发现相比方案1我们的效率有所提升,但是单独就滑动窗口左右两侧的像素计算还是存在重复。所以我们改进了一下,减少左右两侧像素的重复计算。相比方案2我们将为图像的每一列申请一个缓存buf, 用于存储列方向的累加和。计算均值时首先是行方向的累加,就是将相应的5个buf累加。列方向移动时,需要更新buf内的值。
Fig 4 方案3算法示意图
static void BlurU8_5x5_C(DdrPoint* pDdrPoint, BlurU8Param* pParam)
{
float sum = 0.0;
int dstChannel = pParam->dstChannel;
int dstWidth = pParam->dstWidth;
int dstHeight = pParam->dstHeight;
int dstStride = pParam->dstStride;
int srcWidth = pParam->srcWidth;
int srcHeight = pParam->srcHeight;
int srcStride = pParam->srcStride;
int wSizeWidth = pParam->wSizeWidth;
int wSizeHeight = pParam->wSizeHeight;
int kSize = wSizeWidth * wSizeHeight;
for (int k = 0; k < dstChannel; ++k)
{
int srcWidthTmp = srcWidth + (wSizeWidth / 2) * 2;
int srcHeightTmp = srcHeight + (wSizeHeight / 2) * 2;
int* sumTmp = (int*)malloc(srcWidthTmp * sizeof(int));
unsigned char* pSrcTmpU8 = pDdrPoint->pSrcU8 + k * srcWidth * srcHeight;
unsigned char* pDstTmpU8 = pDdrPoint->pDstU8 + k * dstWidth * dstHeight;
unsigned char* pTmpBuffer = (unsigned char*)malloc(srcWidthTmp * srcHeightTmp);
memset(pTmpBuffer, 0, srcWidthTmp * srcHeightTmp);
unsigned char* pSrcTmpBuffer = pTmpBuffer + (wSizeHeight / 2) * srcWidthTmp;
for (int i = 0; i < srcHeight; ++i)
{
memcpy((pSrcTmpBuffer + (wSizeWidth / 2) + i * srcWidthTmp), (pSrcTmpU8 + i * srcWidth), srcWidth);
}
memset(sumTmp, 0, srcWidthTmp * sizeof(int));
for (int i = 0; i < srcWidthTmp; ++i)
{
sumTmp[i] += *(pSrcTmpBuffer - 2 * srcWidthTmp + i);
sumTmp[i] += *(pSrcTmpBuffer - srcWidthTmp + i);
sumTmp[i] += *(pSrcTmpBuffer + i);
sumTmp[i] += *(pSrcTmpBuffer + srcWidthTmp + i);
sumTmp[i] += *(pSrcTmpBuffer + 2 * srcWidthTmp + i);
}
for (int r = 0; r < dstHeight; ++r)
{
int index = 2;
sum = 0.0;
sum = sumTmp[0] + sumTmp[1] + sumTmp[2] + sumTmp[3] + sumTmp[4];
*(pDstTmpU8 + r * dstStride) = (int)((sum + kSize / 2) / kSize);
for (int c = 1; c < dstWidth; ++c)
{
int index = (wSizeWidth / 2) + c - 3;
SUM
sum -= sumTmp[index];
sum += sumTmp[index+5];
// AVE STORE
*(pDstTmpU8 + r * dstStride + c) = (int)((sum + kSize / 2) / kSize);
}
for (int i = 0; i < srcWidthTmp; ++i)
{
sumTmp[i] -= *(pSrcTmpBuffer + r * srcWidthTmp - 2 * srcWidthTmp + i);
sumTmp[i] += *(pSrcTmpBuffer + r * srcWidthTmp + 3 * srcWidthTmp + i);
}
}
free(pTmpBuffer);
}
}
(4) 方案4:经过上述上个步骤我们已经对均值滤波优化有了一个大幅度的提升,以上三个部分是针对加法部分优化,适用于任何情况。如果我们的图片输入输出类型都是8bit, 我们还可以对我们的均值滤波中的除法运算进行优化。我们可以提前建立一张表,用于存储除法所有可能的结果,然后就可以将除法运算改成查找表,实现性能上的提升。
int divisionRes[255*25];
static void BlurU8_5x5_C(DdrPoint* pDdrPoint, BlurU8Param* pParam)
{
int res = -1;
for (int i = 0; i < 255 * 25; ++i)
{
if (i % 25 == 0)
{
++res;
}
divisionRes[i] = res;
}
float sum = 0.0;
int dstChannel = pParam->dstChannel;
int dstWidth = pParam->dstWidth;
int dstHeight = pParam->dstHeight;
int dstStride = pParam->dstStride;
int srcWidth = pParam->srcWidth;
int srcHeight = pParam->srcHeight;
int srcStride = pParam->srcStride;
int wSizeWidth = pParam->wSizeWidth;
int wSizeHeight = pParam->wSizeHeight;
int kSize = wSizeWidth * wSizeHeight;
for (int k = 0; k < dstChannel; ++k)
{
int srcWidthTmp = srcWidth + (wSizeWidth / 2) * 2;
int srcHeightTmp = srcHeight + (wSizeHeight / 2) * 2;
int* sumTmp = (int*)malloc(srcWidthTmp * sizeof(int));
unsigned char* pSrcTmpU8 = pDdrPoint->pSrcU8 + k * srcWidth * srcHeight;
unsigned char* pDstTmpU8 = pDdrPoint->pDstU8 + k * dstWidth * dstHeight;
unsigned char* pTmpBuffer = (unsigned char*)malloc(srcWidthTmp * srcHeightTmp);
memset(pTmpBuffer, 0, srcWidthTmp * srcHeightTmp);
unsigned char* pSrcTmpBuffer = pTmpBuffer + (wSizeHeight / 2) * srcWidthTmp;
for (int i = 0; i < srcHeight; ++i)
{
memcpy((pSrcTmpBuffer + (wSizeWidth / 2) + i * srcWidthTmp), (pSrcTmpU8 + i * srcWidth), srcWidth);
}
memset(sumTmp, 0, srcWidthTmp * sizeof(int));
for (int i = 0; i < srcWidthTmp; ++i)
{
sumTmp[i] += *(pSrcTmpBuffer - 2 * srcWidthTmp + i);
sumTmp[i] += *(pSrcTmpBuffer - srcWidthTmp + i);
sumTmp[i] += *(pSrcTmpBuffer + i);
sumTmp[i] += *(pSrcTmpBuffer + srcWidthTmp + i);
sumTmp[i] += *(pSrcTmpBuffer + 2 * srcWidthTmp + i);
}
for (int r = 0; r < dstHeight; ++r)
{
int index = 2;
sum = 0.0;
sum = sumTmp[0] + sumTmp[1] + sumTmp[2] + sumTmp[3] + sumTmp[4];
*(pDstTmpU8 + r * dstStride + c) = divisionRes[int(sum + kSize / 2)];
for (int c = 1; c < dstWidth; ++c)
{
int index = (wSizeWidth / 2) + c - 3;
SUM
sum -= sumTmp[index];
sum += sumTmp[index+5];
// AVE STORE
*(pDstTmpU8 + r * dstStride + c) = divisionRes[int(sum + kSize / 2)];
}
for (int i = 0; i < srcWidthTmp; ++i)
{
sumTmp[i] -= *(pSrcTmpBuffer + r * srcWidthTmp - 2 * srcWidthTmp + i);
sumTmp[i] += *(pSrcTmpBuffer + r * srcWidthTmp + 3 * srcWidthTmp + i);
}
}
free(pTmpBuffer);
}
}
关于均值滤波的NEON优化,详见下一篇《均值滤波NEON实现及优化》。