前言
Harris 响应值可以判断图像上的点是否“像一个角点”,通常用于角点检测,有时候也会在特征提取/匹配/跟踪算法中用于评估特征点质量的好坏,因为角点响应值大的点往往更适合特征匹配或者特征跟踪,OpenCV 的 ORB 检测器就是这样的。
具体原理就不细说了,可以参见:
归结到公式就是:
c
o
v
(
x
,
y
)
=
∑
(
u
,
v
)
∈
W
(
x
,
y
)
w
(
u
,
v
)
[
I
u
2
I
u
I
v
I
u
I
v
I
v
2
]
R
=
d
e
t
(
c
o
v
)
−
k
∗
t
r
a
c
e
(
c
o
v
)
cov(x,y)=\sum_{(u,v)\in W(x,y)}{w(u,v)\begin{bmatrix} I_u^2 & I_uI_v \\ I_uI_v & I_v^2 \end{bmatrix}} \\ R=det(cov)-k*trace(cov)
cov(x,y)=(u,v)∈W(x,y)∑w(u,v)[Iu2IuIvIuIvIv2]R=det(cov)−k∗trace(cov)
其中
I
u
I_u
Iu 和
I
v
I_v
Iv 分别是 x 和 y 方向的梯度,OpenCV 中使用 Sobel 或者 Scharr 计算。
c
o
v
cov
cov 矩阵在目标点邻域
W
(
x
,
y
)
W(x,y)
W(x,y) 进行了平均,OpenCV 中直接使用 boxFilter 进行均值滤波。最终的响应值由
c
o
v
cov
cov 矩阵的行列式和迹算得,参见 corner.cpp。
计算稀疏 Harris 响应
OpenCV 对外暴露的 Harris 响应计算函数为 cornerHarris()
,这个函数是计算全图每个点响应值的,如果我们只想计算少量点(比如数百个)的角点响应,用它显然就太浪费时间了。为此,OpenCV 在 orb.cpp 中实现了用于计算稀疏点的 HarrisResponses()
,不过该函数是静态的,并未对外暴露。整理后它的实现代码为:
void HarrisResponses(InputArray image, InputArray pts, OutputArray responses, int blockSize = 7, double k = 0.04)
{
CV_Assert(image.type() == CV_8UC1 && image.isContinuous());
CV_Assert(pts.type() == CV_32FC2 && pts.rows() == 1);
CV_Assert(responses.type() == CV_32FC1);
CV_Assert(blockSize > 0 && blockSize < 45);
Mat img = image.getMat();
Mat _mpts = pts.getMat();
Point2f* _pts = _mpts.ptr<Point2f>();
responses.create(1, pts.cols(), CV_32FC1);
Mat dst = responses.getMat();
float* res = dst.ptr<float>();
const uchar* ptr00 = img.ptr<uchar>();
int step = img.cols;
int r = blockSize / 2;
// 1 << (3 - 1): Sobel normalize factor
// 255: gray scale normalize factor
float scale = 1.f / ((1 << (3 - 1)) * 255.f * blockSize);
scale = scale * scale * scale * scale;
AutoBuffer<int> ofs(blockSize * blockSize);
for (int i = 0; i < blockSize; ++i)
for (int j = 0; j < blockSize; ++j)
ofs[i * blockSize + j] = i * step + j;
for (int idx = 0; idx < pts.cols(); ++idx) {
int x0 = cvRound(_pts[idx].x) - r;
int y0 = cvRound(_pts[idx].y) - r;
if (x0 < 1 || x0 + blockSize > img.cols - 2
|| y0 < 1 || y0 + blockSize > img.rows - 2) {
res[idx] = 0.f;
continue;
}
const uchar* ptr0 = ptr00 + y0 * step + x0;
float a = 0.f, b = 0.f, c = 0.f;
for (size_t i = 0; i < ofs.size(); ++i) {
const uchar* ptr = ptr0 + ofs[i];
// Sobel 3x3
float Ix = ((ptr[1] - ptr[-1]) << 1)
+ (ptr[-step + 1] - ptr[-step - 1])
+ (ptr[step + 1] - ptr[step - 1]);
float Iy = ((ptr[step] - ptr[-step]) << 1)
+ (ptr[1 + step] - ptr[1 - step])
+ (ptr[-1 + step] - ptr[-1 - step]);
// sum up in blockSize area
a += Ix * Ix;
b += Iy * Iy;
c += Ix * Iy;
}
// R = det(cov) - k * trace(cov)
res[idx] = (a * b - c * c - k * (a + b) * (a + b)) * scale;
}
}
SSE 加速优化
参考 ICE-BA 中的 neon 优化版本,我写了一个 SSE 优化版,实测经 sse2neon 转化后,在 aarch64 平台上速度是优于 ICE-BA 版本的:
void HarrisResponsesSSE(InputArray image, InputArray pts, OutputArray responses, int blockSize = 7, double k = 0.04)
{
CV_Assert(image.type() == CV_8UC1);
CV_Assert(pts.type() == CV_32FC2 && pts.rows() == 1);
CV_Assert(responses.type() == CV_32FC1);
CV_Assert(blockSize >= 7 && blockSize <= 9);
Mat img = image.getMat();
Mat _mpts = pts.getMat();
Point2f* _pts = _mpts.ptr<Point2f>();
responses.create(1, pts.cols(), CV_32FC1);
Mat dst = responses.getMat();
float* res = dst.ptr<float>();
__m128i raw_buffer[3];
size_t ptidx, ptsize = pts.cols();
const uchar* ptr00 = img.ptr<uchar>();
int step = static_cast<int>(img.step / img.elemSize1());
int r = blockSize / 2;
float scale = (1 << 2) * blockSize * 255.0f;
scale = 1.0f / scale;
float scale_sq_sq = scale * scale * scale * scale;
__m128i* row_top = &raw_buffer[0];
__m128i* row_mid = &raw_buffer[1];
__m128i* row_bot = &raw_buffer[2];
__m128i* tmp_ptr = nullptr;
for (ptidx = 0; ptidx < ptsize; ptidx++) {
int a = 0, b = 0, c = 0;
int x0 = cvRound(_pts[ptidx].x - r);
int y0 = cvRound(_pts[ptidx].y - r);
if (x0 < 1 || x0 + blockSize > img.cols - 2
|| y0 < 1 || y0 + blockSize > img.rows - 2) {
res[ptidx] = 0.f;
continue;
}
const uchar* ptr0 = ptr00 + y0 * step + x0;
*row_mid = _mm_loadu_si128((__m128i*)(ptr0 - step - 1)); // top row
*row_bot = _mm_loadu_si128((__m128i*)(ptr0 - 1)); // middle row
__m128i va = _mm_setzero_si128();
__m128i vb = _mm_setzero_si128();
__m128i vc = _mm_setzero_si128();
auto accumulateCoeffs = [&va, &vb, &vc, blockSize](const __m128i top_row,
const __m128i mid_row, const __m128i bot_row) {
__m128i ix, iy;
__m128i row_top_lo, row_top_hi;
__m128i row_bot_lo, row_bot_hi;
row_top_lo = _mm_unpacklo_epi8(top_row, _mm_setzero_si128());
row_top_hi = _mm_unpackhi_epi8(top_row, _mm_setzero_si128());
row_bot_lo = _mm_unpacklo_epi8(bot_row, _mm_setzero_si128());
row_bot_hi = _mm_unpackhi_epi8(bot_row, _mm_setzero_si128());
__m128i diff0 = _mm_sub_epi16(row_bot_lo, row_top_lo);
__m128i diff_hi = _mm_sub_epi16(row_bot_hi, row_top_hi);
__m128i diff1 = _mm_slli_epi16(_mm_alignr_epi8(diff_hi, diff0, 2), 1);
__m128i diff2 = _mm_alignr_epi8(diff_hi, diff0, 4);
iy = _mm_add_epi16(_mm_add_epi16(diff0, diff2), diff1);
diff0 = _mm_sub_epi16(_mm_cvtepu8_epi16(_mm_srli_si128(top_row, 2)), row_top_lo);
diff1 = _mm_sub_epi16(_mm_cvtepu8_epi16(_mm_srli_si128(mid_row, 2)),
_mm_unpacklo_epi8(mid_row, _mm_setzero_si128()));
diff1 = _mm_slli_epi16(diff1, 1);
diff2 = _mm_sub_epi16(_mm_cvtepu8_epi16(_mm_srli_si128(bot_row, 2)), row_bot_lo);
ix = _mm_add_epi16(diff1, _mm_add_epi16(diff0, diff2));
// set the last lane to zero
if (blockSize == 7) {
__m128i mask = _mm_srli_si128(_mm_set1_epi8(0xFF), 2);
ix = _mm_and_si128(ix, mask);
iy = _mm_and_si128(iy, mask);
}
__m128i ix_lo, ix_hi;
__m128i iy_lo, iy_hi;
ix_lo = _mm_cvtepi16_epi32(ix);
ix_hi = _mm_cvtepi16_epi32(_mm_srli_si128(ix, 8));
iy_lo = _mm_cvtepi16_epi32(iy);
iy_hi = _mm_cvtepi16_epi32(_mm_srli_si128(iy, 8));
va = _mm_add_epi32(_mm_add_epi32(_mm_mullo_epi32(ix_lo, ix_lo), _mm_mullo_epi32(ix_hi, ix_hi)), va);
vb = _mm_add_epi32(vb, _mm_add_epi32(_mm_mullo_epi32(iy_lo, iy_lo), _mm_mullo_epi32(iy_hi, iy_hi)));
vc = _mm_add_epi32(vc, _mm_add_epi32(_mm_mullo_epi32(ix_lo, iy_lo), _mm_mullo_epi32(ix_hi, iy_hi)));
}; //- accumulateCoeffs
for (int _k = 0; _k < blockSize; _k++, ptr0 += step) {
// shifting
tmp_ptr = row_top;
row_top = row_mid;
row_mid = row_bot;
row_bot = tmp_ptr;
*row_bot = _mm_loadu_si128((__m128i*)(ptr0 + step - 1)); // bottom row
accumulateCoeffs(*row_top, *row_mid, *row_bot);
if (blockSize == 9) {
const uchar* rtop_ptr = reinterpret_cast<const uchar*>(row_top);
const uchar* rmid_ptr = reinterpret_cast<const uchar*>(row_mid);
const uchar* rbot_ptr = reinterpret_cast<const uchar*>(row_bot);
int ix = (rmid_ptr[10] - rmid_ptr[8]) * 2 + (rtop_ptr[10] - rtop_ptr[8]) + (rbot_ptr[10] - rbot_ptr[8]);
int iy = (rbot_ptr[9] - rtop_ptr[9]) * 2 + (rbot_ptr[8] - rtop_ptr[8]) + (rbot_ptr[10] - rtop_ptr[10]);
a += ix * ix;
b += iy * iy;
c += ix * iy;
}
}
auto hsumEpi32 = [](__m128i x) -> int { // sum up all elements as int32
__m128i hi64 = _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2));
__m128i sum64 = _mm_add_epi32(hi64, x);
__m128i hi32 = _mm_shufflelo_epi16(sum64, _MM_SHUFFLE(1, 0, 3, 2));
__m128i sum32 = _mm_add_epi32(sum64, hi32);
return _mm_cvtsi128_si32(sum32);
};
a += hsumEpi32(va);
b += hsumEpi32(vb);
c += hsumEpi32(vc);
res[ptidx] = scale_sq_sq * (static_cast<float>(a) * b - static_cast<float>(c) * c
- k * (static_cast<float>(a) + b) * (static_cast<float>(a) + b));
}
}
速度测试
测试图像(来自 TUM 数据集):
测试代码:
double benchmark(size_t samples, size_t iterations, const function<void()>& op)
{
using namespace std::chrono;
auto now = [] { return high_resolution_clock::now(); };
samples = max(1ul, samples);
iterations = max(1ul, iterations);
double best_time = numeric_limits<double>::infinity();
for (size_t s = 0; s < samples; ++s) {
auto t1 = now();
for (size_t i = 0; i < iterations; ++i)
op();
best_time = min(best_time, duration<double>(now() - t1).count());
}
best_time = best_time / iterations * 1e3;
return best_time;
}
int main(int argc, char* argv[])
{
string fn_src = argc > 1 ? argv[1] : "../scene.jpg";
Mat img_src = imread(fn_src, IMREAD_GRAYSCALE);
if (img_src.empty()) {
cout << "cannot load image: " << fn_src << endl;
return 0;
}
cout << "image size: " << img_src.cols << "*" << img_src.rows << endl;
const int block_size = 7;
const double k = 0.04;
const int boundary = (block_size / 2) + 2;
vector<Point2f> pts;
for (int y = boundary; y < img_src.rows - boundary; ++y)
for (int x = boundary; x < img_src.cols - boundary; ++x)
pts.emplace_back(x, y);
cout << "num of pts: " << pts.size() << endl;
// benchmark
vector<float> result_normal, result_sse;
double time_normal = benchmark(3, 10, [&] {
HarrisResponses(img_src, pts, result_normal, block_size, k);
});
cout << "time normal: " << time_normal << " ms" << endl;
double time_sse = benchmark(3, 10, [&] {
HarrisResponsesSSE(img_src, pts, result_sse, block_size, k);
});
cout << "time SSE: " << time_sse << endl;
cout << "speed up: " << time_normal / time_sse << " ms" << endl;
// compare results
for (size_t i = 0; i < pts.size(); ++i) {
float err = abs(result_normal[i] - result_sse[i]);
if (err > 1e6) {
auto pt = pts[i];
cout << format("error too large: pt[%f, %f] err[%f]", pt.x, pt.y, err) << endl;
break;
}
}
return 0;
}
在本人 i7-8700 CPU 上测试结果为:
image size: 640*480
num of pts: 296100
time normal: 41.9455 ms
time SSE: 14.3351
speed up: 2.92607 ms
提速约为 3 倍。