https://www.cnblogs.com/Imageshop/archive/2013/04/26/3045672.html
Huang算法 时间复杂度O(r)
https://www.cnblogs.com/yoyo-sincerely/p/6058944.html
CTMF算法 时间复杂度O(1)
论文概述
论文主要工作在于提出了一种O(1)复杂度的中值滤波算法,另外还描述了更高维度、更高数据精度和近似圆的核的中值滤波方法。
中值滤波的发展受限于其需要的处理时间,常规的优化方法不适用于这种非线性及不可分离的算法结构。简单暴力地进行中值滤波,其核心部分的算法复杂度为O(r2log r),r为核心的半径大小,其中r2用于遍历核心的每一个元素,log r用于对核心元素进行排序。当可能的像素值的数量为常数时,可以利用桶排序对其进行优化,将复杂度降至O(r2)。
Huang提出的复杂度O(r)的算法,仅需要在更新核心的过程中进行2r+1的加法和2r-1的减法,再对核心部分进行累加即可得出中值。Gil提出了基于树的O(log2r)复杂度算法,他们在同一篇论文中也阐述了O(log r)是所有二维中值滤波算法的下界。(打脸?)论文算法核心灵感来自于Huang的论文,且配套提出了4个所提出算法的优化方法。分别是1.向量化;2.针对CPU cache大小的优化;3.多级直方图;4.条件更新直方图(未理解)。实现部分代码完成了论文提到的1、3项优化方法。第2条优化方法,需要回去翻翻CPU cache的知识,以便深刻理解。第4条优化方法还没看到,后续看懂了也会再补充上来。
我会反复研读本篇论文,直到完全理解了论文提及的方方面面,并以博客的形式记录下来。
实现与优化过程
v1版本 算法中过多使用了vector、list等高级数据结构,算法有过多额外的开销,因此需要减少对其的依赖。
v2版本 直接用指针进行操作,效率上有很大的提升。
v3版本 加入了AVX2加速(优化方法1)
v4版本 加入了多级直方图优化(优化方法3)
多级直方图有效降低了时间复杂度,减少了大量重复的比较和累加计算,因此提速较为明显。越来越接近论文提供的CTMF代码的速度了,争取超越它,然后不断向opencv源码的效率靠近。
以下为v4版本部分代码
#include <opencv2\core.hpp>
#include <immintrin.h>
// AVX 256 * 2^16 histogram's addition, result save in h_kernel
void AVX_multilevel_histogram256_add_(const __m256i* col_histogram_batch, const uint16_t* coarse_col_histogram, __m256i* kernel_histogram_batch, uint16_t* coarse_kernel_histogram)
{
for (size_t i = 0; i < 16; ++i)
{
if (coarse_col_histogram[i] == 0) continue;
kernel_histogram_batch[i] = _mm256_adds_epu16(col_histogram_batch[i], kernel_histogram_batch[i]);
coarse_kernel_histogram[i] += coarse_col_histogram[i];
}
}
// AVX 256 * 2^16 histogram's subtraction, result save in h_kernel
void AVX_multilevel_histogram256_sub_(const __m256i* col_histogram_batch, const uint16_t* coarse_col_histogram, __m256i* kernel_histogram_batch, uint16_t* coarse_kernel_histogram)
{
for (size_t i = 0; i < 16; ++i)
{
if (coarse_col_histogram[i] == 0) continue;
kernel_histogram_batch[i] = _mm256_subs_epu16(kernel_histogram_batch[i], col_histogram_batch[i]);
coarse_kernel_histogram[i] -= coarse_col_histogram[i];
}
}
uchar AVX_multilevel_histogram256_median_value(const __m256i* h_kernel, const uint16_t* coarse_kernel_histogram, size_t h_amount)
{
size_t cnt = 0;
size_t batch_cnt = 0;
for (size_t batch_ind = 0; batch_ind < 16; ++batch_ind)
{
batch_cnt += coarse_kernel_histogram[batch_ind];
if (batch_cnt > h_amount / 2)
{
for (size_t ind = 0; ind < 16; ++ind)
{
cnt += h_kernel[batch_ind].m256i_u16[ind];
if (cnt > h_amount / 2)
{
return batch_ind * 16 + ind;
}
}
}
}
return 0;
}
// Description: v4版,增加多级直方图优化
//
Mat CTMFMedBlurV4(const Mat& src, size_t radius)
{
// 此处radius是半径,比如5*5半径为2
Mat result(src.size(), CV_8U);
size_t kernel_amount = (radius * 2 + 1) * (radius * 2 + 1);
// TODO: 可旋转图像防止初始化向量组过大而占用太多的空间
CV_Assert(src.rows >= radius * 2 + 1);
// 声明滤波核心的直方图
//uint16_t kernel_histogram[256];
// fine histogram
__m256i kernel_histogram[16];
memset(kernel_histogram, 0, 16 * sizeof(__m256i));
// coarse histogram
uint16_t coarse_kernel_histogram[16];
memset(coarse_kernel_histogram, 0, 16 * sizeof(uint16_t));
// 声明列向量组的直方图
// Histogram maximum value is 2^16-1
// fine histograms
vector<__m256i*> col_histograms;
col_histograms.resize(src.cols);
// coarse histograms
vector<uint16_t*> coarse_col_histograms;
coarse_col_histograms.resize(src.cols);
for (size_t ind = 0; ind < col_histograms.size(); ++ind)
{
col_histograms[ind] = (__m256i*)malloc(16 * sizeof(__m256i));
memset(col_histograms[ind], 0, 16 * sizeof(__m256i));
coarse_col_histograms[ind] = (uint16_t*)malloc(16 * sizeof(uint16_t));
memset(coarse_col_histograms[ind], 0, 16 * sizeof(uint16_t));
for (size_t row = 0; row < 2 * radius + 1; ++row)
{
++(col_histograms[ind][src.ptr<uchar>(row)[ind] / 16].m256i_u16[src.ptr<uchar>(row)[ind] % 16]);
++(coarse_col_histograms[ind][src.ptr<uchar>(row)[ind] / 16]);
}
if (ind < 2 * radius + 1)
{
AVX_multilevel_histogram256_add_(col_histograms[ind], coarse_col_histograms[ind], kernel_histogram, coarse_kernel_histogram);
}
}
result.ptr<uchar>(radius)[radius] = AVX_multilevel_histogram256_median_value(kernel_histogram, coarse_kernel_histogram, kernel_amount);
for (size_t col = radius + 1; col < col_histograms.size() - radius; ++col)
{
AVX_multilevel_histogram256_sub_(col_histograms[col - radius - 1], coarse_col_histograms[col], kernel_histogram, coarse_kernel_histogram);
AVX_multilevel_histogram256_add_(col_histograms[col + radius], coarse_col_histograms[col], kernel_histogram, coarse_kernel_histogram);
result.ptr<uchar>(radius)[col] = AVX_multilevel_histogram256_median_value(kernel_histogram, coarse_kernel_histogram, kernel_amount);
}
// Process next all rows(todo: add copy kernel_histogram)
for (size_t row = radius + 1; row < src.rows - radius; ++row)
{
memset(kernel_histogram, 0, 16 * sizeof(__m256i));
memset(coarse_kernel_histogram, 0, 16 * sizeof(uint16_t));
// (下面这里写得不够好)
for (size_t col = 0; col < col_histograms.size(); ++col)
{
if (col >= 2 * radius + 1)
{
AVX_multilevel_histogram256_sub_(col_histograms[col - 2 * radius - 1], coarse_col_histograms[col], kernel_histogram, coarse_kernel_histogram);
}
--col_histograms[col][src.ptr<uchar>(row - radius - 1)[col] / 16].m256i_u16[src.ptr<uchar>(row - radius - 1)[col] % 16];
--(coarse_col_histograms[col][src.ptr<uchar>(row - radius - 1)[col] / 16]);
++col_histograms[col][src.ptr<uchar>(row + radius)[col] / 16].m256i_u16[src.ptr<uchar>(row + radius)[col] % 16];
++(coarse_col_histograms[col][src.ptr<uchar>(row + radius)[col] / 16]);
AVX_multilevel_histogram256_add_(col_histograms[col], coarse_col_histograms[col], kernel_histogram, coarse_kernel_histogram);
if (col >= 2 * radius)
{
result.ptr<uchar>(row)[col - radius] = AVX_multilevel_histogram256_median_value(kernel_histogram, coarse_kernel_histogram, kernel_amount);
}
}
}
for (size_t ind = 0; ind < col_histograms.size(); ++ind)
{
free(col_histograms[ind]);
free(coarse_col_histograms[ind]);
}
return result;
}
文章转载请务必保留来源链接,谢谢!