前言
本文介绍均值滤波的几种优化方法,包括使用 NEON Intrinsics 的方法。
一、均值滤波
均值滤波是一种平滑滤波。平滑滤波是低频增强的空间域滤波技术,它的目的有两类:一类是模糊;另一类是消除噪声。
二、使用步骤
1. 原始像素的遍历
遍历每个像素,并对其半径范围内(在图像边界处截断)的所有像素求和,其时间复杂度是 O( height x width x (radius x 2 + 1) x (radius x 2 + 1) )。注意:下面的代码输出的是窗口内像素之和,并没有除以窗口内的像素个数。
/**
 * Naive box sum.
 *
 * For every pixel, adds up all input samples inside the
 * (2 * radius + 1) x (2 * radius + 1) window centred on that pixel. The
 * window is clipped to the image borders, so border pixels sum fewer samples.
 * The stored value is the raw window sum; it is not divided by the number of
 * samples.
 *
 * @param input   row-major image, height * width floats (only read)
 * @param radius  half-size of the window, in pixels
 * @param height  number of image rows
 * @param width   number of image columns
 * @param output  row-major buffer, height * width floats; every element is
 *                overwritten
 */
void BoxFilter::filter(float *input, int radius, int height, int width, float *output) {
    for (int row = 0; row < height; ++row) {
        // Vertical extent of the window; it only depends on the row.
        const int top = std::max(0, row - radius);
        const int bottom = std::min(height - 1, row + radius);
        float *outRow = output + row * width;

        for (int col = 0; col < width; ++col) {
            // Horizontal extent of the window, clipped to the image.
            const int left = std::max(0, col - radius);
            const int right = std::min(width - 1, col + radius);

            // Accumulate the window row by row, left to right.
            float acc = 0;
            for (int y = top; y <= bottom; ++y) {
                const float *inRow = input + y * width;
                for (int x = left; x <= right; ++x) {
                    acc += inRow[x];
                }
            }
            outRow[col] = acc;
        }
    }
}
2. 行列的分离
首先计算行方向(水平方向)上半径范围内的和并存入缓存,然后再对缓存沿列方向(竖直方向)求和。复杂度 O(height x width x (radius x 2 + 1) x 2)
/**
 * Separable box sum: a horizontal pass followed by a vertical pass.
 *
 * Pass 1 stores, for every pixel, the sum of the input samples within
 * `radius` columns of it (clipped to the row) into the `cache` buffer.
 * Pass 2 stores, for every pixel, the sum of the cached values within
 * `radius` rows of it (clipped to the image) into `output`.
 * The result is the raw window sum; it is not divided by the sample count.
 *
 * `output` is fully overwritten: each output row is cleared before it is
 * accumulated, so the caller does not need to zero the buffer beforehand.
 *
 * @param input   row-major image, height * width floats (only read)
 * @param radius  half-size of the window, in pixels
 * @param height  number of image rows
 * @param width   number of image columns
 * @param output  row-major buffer, height * width floats
 */
void BoxFilter::fastFilter(float *input, int radius, int height, int width, float *output) {
    // Scratch buffer for the horizontal sums. `cache` is not declared in this
    // function; it is indexed up to height * width - 1 below, so it must
    // already hold at least that many floats.
    float *cachePtr = &(cache[0]);

    // Pass 1: horizontal sums.
    for (int h = 0; h < height; ++h) {
        const float *inRow = input + h * width;
        float *cacheRow = cachePtr + h * width;
        for (int w = 0; w < width; ++w) {
            const int start_w = std::max(0, w - radius);
            const int end_w = std::min(width - 1, w + radius);
            float tmp = 0;
            for (int sw = start_w; sw <= end_w; ++sw) {
                tmp += inRow[sw];
            }
            cacheRow[w] = tmp;
        }
    }

    // Pass 2: vertical sums. Each output row gathers the cached rows inside
    // its window in increasing row order, which is the same set and order of
    // additions a zero-initialised scatter would perform.
    for (int h = 0; h < height; ++h) {
        float *outRow = output + h * width;
        std::fill_n(outRow, width, 0.0f);

        const int start_h = std::max(0, h - radius);
        const int end_h = std::min(height - 1, h + radius);
        for (int sh = start_h; sh <= end_h; ++sh) {
            const float *cacheRow = cachePtr + sh * width;
            for (int w = 0; w < width; ++w) {
                outRow[w] += cacheRow[w];
            }
        }
    }
}
3. 头尾相减
在前一个元素的窗口和的基础上,窗口滑动一格之后,减去移出窗口的旧点、加上新进入窗口的点,即可得到当前元素的窗口和。复杂度 O(height x width x 2 x 2),与半径无关。
/**
 * Sliding-window box sum.
 *
 * Starting from the previous element's window sum, the window is shifted by
 * one position: the sample that enters the window is added and the sample
 * that leaves it is subtracted. This is done first along every row (results
 * go to the `cache` buffer) and then along every column (running sums are
 * kept in the `colSum` buffer and copied to `output`).
 * The result is the raw window sum; it is not divided by the sample count.
 *
 * Every add/subtract is guarded by a bounds check, so images narrower or
 * shorter than 2 * radius + 1 are handled without reading or writing outside
 * the buffers. For larger images the additions and subtractions happen in the
 * same order as an explicit head / middle / tail split would perform them.
 *
 * @param input   row-major image, height * width floats (only read)
 * @param radius  half-size of the window, in pixels; assumed to be >= 0
 * @param height  number of image rows
 * @param width   number of image columns
 * @param output  row-major buffer, height * width floats; every element is
 *                overwritten
 */
void BoxFilter::fastFilterV2(float *input, int radius, int height, int width, float *output) {
    // `cache` is not declared in this function; it must already hold at least
    // height * width floats (e.g. cache.resize(height * width) elsewhere).
    float *cachePtr = &(cache[0]);

    // ---- Horizontal pass: cache[h][w] = sum of input[h][w-radius .. w+radius].
    const int headCols = std::min(radius, width);
    for (int h = 0; h < height; ++h) {
        const float *inRow = input + h * width;
        float *cacheRow = cachePtr + h * width;

        // Seed with the samples left of column `radius`; the first loop
        // iteration below then adds column `radius` itself.
        float tmp = 0;
        for (int w = 0; w < headCols; ++w) {
            tmp += inRow[w];
        }

        for (int w = 0; w < width; ++w) {
            const int entering = w + radius;      // column joining the window
            const int leaving = w - radius - 1;   // column dropping out of it
            if (entering < width) {
                tmp += inRow[entering];
            }
            if (leaving >= 0) {
                tmp -= inRow[leaving];
            }
            cacheRow[w] = tmp;
        }
    }

    // ---- Vertical pass: one running sum per column.
    // `colSum` is not declared in this function; it must already hold at
    // least `width` floats (e.g. colSum.resize(width) elsewhere).
    float *colSumPtr = &(colSum[0]);
    for (int w = 0; w < width; ++w) {
        colSumPtr[w] = 0;
    }

    // Seed with the cached rows above row `radius`.
    const int headRows = std::min(radius, height);
    for (int h = 0; h < headRows; ++h) {
        const float *cacheRow = cachePtr + h * width;
        for (int w = 0; w < width; ++w) {
            colSumPtr[w] += cacheRow[w];
        }
    }

    for (int h = 0; h < height; ++h) {
        const int entering = h + radius;      // row joining the window
        const int leaving = h - radius - 1;   // row dropping out of it
        // A null pointer marks a row that lies outside the image.
        const float *addPtr = (entering < height) ? cachePtr + entering * width : nullptr;
        const float *subPtr = (leaving >= 0) ? cachePtr + leaving * width : nullptr;
        float *outPtr = output + h * width;

        for (int w = 0; w < width; ++w) {
            float sum = colSumPtr[w];
            if (addPtr != nullptr) {
                sum += addPtr[w];
            }
            if (subPtr != nullptr) {
                sum -= subPtr[w];
            }
            colSumPtr[w] = sum;
            outPtr[w] = sum;
        }
    }
}
4. intrinsic方法
在行方向上由于相邻元素有依赖关系,因此是无法并行的,所以我们可以在列方向上使用Neon Intrinsics来并行处理数据。
void BoxFilter::fastFilterV2NeonIntrinsics(float *input, int radius, int height, int width, float *output) {
int Block = Width >> 2;
int Remain = Width - (Block << 2);
//Origin
// for(int Y = 0; Y < Radius; Y++){
// int Stride = Y * Width;
// for(int X = 0; X < Width; X++){
// colsumPtr[X] += colsumPtr[Stride + X];
// }
// }
for(int Y = 0; Y < Radius; Y++){
int Stride = Y * Width;
float* tmpColSumPtr = colsumPtr;
float* tmpCachePtr = cachePtr;
int n = Block;
int re = Remain;
//the 4 block
for(; n > 0; n--){
float32x4_t colsum = vld1q_f32(tmpColSumPtr);
float32x4_t cache = vld1q_f32(tmpCachePtr);
float32x4_t sum = vaddq_f32(colsum, cache);
vst1q_f32(tmpColSumPtr, sum);
tmpColSumPtr += 4;
tmpCachePtr += 4;
}
//the remain
for (; re > 0; re--) {
*tmpColSumPtr += *tmpCachePtr;
tmpColSumPtr ++;
tmpCachePtr ++;
}
}
}
总结
传统的(不使用 NEON 的)头尾相减方法,每个像素只需要约 4 次加减运算(水平方向和竖直方向各一次加法、一次减法),与半径无关。