稀疏 Harris 响应的 C++ 实现及 SSE 加速

前言

Harris 响应值可以判断图像上的点是否“像一个角点”,通常用于角点检测,有时候也会在特征提取/匹配/跟踪算法中用于评估特征点质量的好坏,因为角点响应值大的点往往更适合特征匹配或者特征跟踪,OpenCV 的 ORB 检测器就是这样的。

具体原理就不细说了,可以参见:

归结到公式就是:
c o v ( x , y ) = ∑ ( u , v ) ∈ W ( x , y ) w ( u , v ) [ I u 2 I u I v I u I v I v 2 ] R = d e t ( c o v ) − k ∗ t r a c e ( c o v ) cov(x,y)=\sum_{(u,v)\in W(x,y)}{w(u,v)\begin{bmatrix} I_u^2 & I_uI_v \\ I_uI_v & I_v^2 \end{bmatrix}} \\ R=det(cov)-k*trace(cov) cov(x,y)=(u,v)W(x,y)w(u,v)[Iu2IuIvIuIvIv2]R=det(cov)ktrace(cov)
其中 I u I_u Iu I v I_v Iv 分别是 x 和 y 方向的梯度,OpenCV 中使用 Sobel 或者 Scharr 计算。 c o v cov cov 矩阵在目标点邻域 W ( x , y ) W(x,y) W(x,y) 进行了平均,OpenCV 中直接使用 boxFilter 进行均值滤波。最终的响应值由 c o v cov cov 矩阵的行列式和迹算得,参见 corner.cpp

计算稀疏 Harris 响应

OpenCV 对外暴露的 Harris 响应计算函数为 cornerHarris(),这个函数是计算全图每个点响应值的,如果我们只想计算少量点(比如数百个)的角点响应,用它显然就太浪费时间了。为此,OpenCV 在 orb.cpp 中实现了用于计算稀疏点的 HarrisResponses(),不过该函数是静态的,并未对外暴露。整理后它的实现代码为:

void HarrisResponses(InputArray image, InputArray pts, OutputArray responses, int blockSize = 7, double k = 0.04)
{
    CV_Assert(image.type() == CV_8UC1 && image.isContinuous());
    CV_Assert(pts.type() == CV_32FC2 && pts.rows() == 1);
    CV_Assert(responses.type() == CV_32FC1);
    CV_Assert(blockSize > 0 && blockSize < 45);

    Mat img = image.getMat();
    Mat _mpts = pts.getMat();
    Point2f* _pts = _mpts.ptr<Point2f>();
    responses.create(1, pts.cols(), CV_32FC1);
    Mat dst = responses.getMat();
    float* res = dst.ptr<float>();

    const uchar* ptr00 = img.ptr<uchar>();
    int step = img.cols;
    int r = blockSize / 2;

    // 1 << (3 - 1): Sobel normalize factor
    // 255: gray scale normalize factor
    float scale = 1.f / ((1 << (3 - 1)) * 255.f * blockSize);
    scale = scale * scale * scale * scale;

    AutoBuffer<int> ofs(blockSize * blockSize);
    for (int i = 0; i < blockSize; ++i)
        for (int j = 0; j < blockSize; ++j)
            ofs[i * blockSize + j] = i * step + j;

    for (int idx = 0; idx < pts.cols(); ++idx) {
        int x0 = cvRound(_pts[idx].x) - r;
        int y0 = cvRound(_pts[idx].y) - r;
        if (x0 < 1 || x0 + blockSize > img.cols - 2
            || y0 < 1 || y0 + blockSize > img.rows - 2) {
            res[idx] = 0.f;
            continue;
        }
        const uchar* ptr0 = ptr00 + y0 * step + x0;
        float a = 0.f, b = 0.f, c = 0.f;

        for (size_t i = 0; i < ofs.size(); ++i) {
            const uchar* ptr = ptr0 + ofs[i];
            // Sobel 3x3
            float Ix = ((ptr[1] - ptr[-1]) << 1)
                + (ptr[-step + 1] - ptr[-step - 1])
                + (ptr[step + 1] - ptr[step - 1]);
            float Iy = ((ptr[step] - ptr[-step]) << 1)
                + (ptr[1 + step] - ptr[1 - step])
                + (ptr[-1 + step] - ptr[-1 - step]);
            // sum up in blockSize area
            a += Ix * Ix;
            b += Iy * Iy;
            c += Ix * Iy;
        }
        // R = det(cov) - k * trace(cov)
        res[idx] = (a * b - c * c - k * (a + b) * (a + b)) * scale;
    }
}

SSE 加速优化

参考 ICE-BA 中的 neon 优化版本,我写了一个 SSE 优化版,实测经 sse2neon 转化后,在 aarch64 平台上速度是优于 ICE-BA 版本的:

void HarrisResponsesSSE(InputArray image, InputArray pts, OutputArray responses, int blockSize = 7, double k = 0.04)
{
    CV_Assert(image.type() == CV_8UC1);
    CV_Assert(pts.type() == CV_32FC2 && pts.rows() == 1);
    CV_Assert(responses.type() == CV_32FC1);
    CV_Assert(blockSize >= 7 && blockSize <= 9);

    Mat img = image.getMat();
    Mat _mpts = pts.getMat();
    Point2f* _pts = _mpts.ptr<Point2f>();
    responses.create(1, pts.cols(), CV_32FC1);
    Mat dst = responses.getMat();
    float* res = dst.ptr<float>();

    __m128i raw_buffer[3];
    size_t ptidx, ptsize = pts.cols();

    const uchar* ptr00 = img.ptr<uchar>();
    int step = static_cast<int>(img.step / img.elemSize1());
    int r = blockSize / 2;

    float scale = (1 << 2) * blockSize * 255.0f;
    scale = 1.0f / scale;
    float scale_sq_sq = scale * scale * scale * scale;

    __m128i* row_top = &raw_buffer[0];
    __m128i* row_mid = &raw_buffer[1];
    __m128i* row_bot = &raw_buffer[2];
    __m128i* tmp_ptr = nullptr;

    for (ptidx = 0; ptidx < ptsize; ptidx++) {
        int a = 0, b = 0, c = 0;
        int x0 = cvRound(_pts[ptidx].x - r);
        int y0 = cvRound(_pts[ptidx].y - r);
        if (x0 < 1 || x0 + blockSize > img.cols - 2
            || y0 < 1 || y0 + blockSize > img.rows - 2) {
            res[ptidx] = 0.f;
            continue;
        }

        const uchar* ptr0 = ptr00 + y0 * step + x0;
        *row_mid = _mm_loadu_si128((__m128i*)(ptr0 - step - 1)); // top row
        *row_bot = _mm_loadu_si128((__m128i*)(ptr0 - 1)); // middle row

        __m128i va = _mm_setzero_si128();
        __m128i vb = _mm_setzero_si128();
        __m128i vc = _mm_setzero_si128();
        auto accumulateCoeffs = [&va, &vb, &vc, blockSize](const __m128i top_row,
                                    const __m128i mid_row, const __m128i bot_row) {
            __m128i ix, iy;

            __m128i row_top_lo, row_top_hi;
            __m128i row_bot_lo, row_bot_hi;
            row_top_lo = _mm_unpacklo_epi8(top_row, _mm_setzero_si128());
            row_top_hi = _mm_unpackhi_epi8(top_row, _mm_setzero_si128());
            row_bot_lo = _mm_unpacklo_epi8(bot_row, _mm_setzero_si128());
            row_bot_hi = _mm_unpackhi_epi8(bot_row, _mm_setzero_si128());

            __m128i diff0 = _mm_sub_epi16(row_bot_lo, row_top_lo);
            __m128i diff_hi = _mm_sub_epi16(row_bot_hi, row_top_hi);
            __m128i diff1 = _mm_slli_epi16(_mm_alignr_epi8(diff_hi, diff0, 2), 1);
            __m128i diff2 = _mm_alignr_epi8(diff_hi, diff0, 4);

            iy = _mm_add_epi16(_mm_add_epi16(diff0, diff2), diff1);

            diff0 = _mm_sub_epi16(_mm_cvtepu8_epi16(_mm_srli_si128(top_row, 2)), row_top_lo);
            diff1 = _mm_sub_epi16(_mm_cvtepu8_epi16(_mm_srli_si128(mid_row, 2)),
                _mm_unpacklo_epi8(mid_row, _mm_setzero_si128()));
            diff1 = _mm_slli_epi16(diff1, 1);
            diff2 = _mm_sub_epi16(_mm_cvtepu8_epi16(_mm_srli_si128(bot_row, 2)), row_bot_lo);

            ix = _mm_add_epi16(diff1, _mm_add_epi16(diff0, diff2));
            // set the last lane to zero
            if (blockSize == 7) {
                __m128i mask = _mm_srli_si128(_mm_set1_epi8(0xFF), 2);
                ix = _mm_and_si128(ix, mask);
                iy = _mm_and_si128(iy, mask);
            }

            __m128i ix_lo, ix_hi;
            __m128i iy_lo, iy_hi;
            ix_lo = _mm_cvtepi16_epi32(ix);
            ix_hi = _mm_cvtepi16_epi32(_mm_srli_si128(ix, 8));
            iy_lo = _mm_cvtepi16_epi32(iy);
            iy_hi = _mm_cvtepi16_epi32(_mm_srli_si128(iy, 8));
            va = _mm_add_epi32(_mm_add_epi32(_mm_mullo_epi32(ix_lo, ix_lo), _mm_mullo_epi32(ix_hi, ix_hi)), va);
            vb = _mm_add_epi32(vb, _mm_add_epi32(_mm_mullo_epi32(iy_lo, iy_lo), _mm_mullo_epi32(iy_hi, iy_hi)));
            vc = _mm_add_epi32(vc, _mm_add_epi32(_mm_mullo_epi32(ix_lo, iy_lo), _mm_mullo_epi32(ix_hi, iy_hi)));
        }; //- accumulateCoeffs

        for (int _k = 0; _k < blockSize; _k++, ptr0 += step) {
            // shifting
            tmp_ptr = row_top;
            row_top = row_mid;
            row_mid = row_bot;
            row_bot = tmp_ptr;
            *row_bot = _mm_loadu_si128((__m128i*)(ptr0 + step - 1)); // bottom row
            accumulateCoeffs(*row_top, *row_mid, *row_bot);
            if (blockSize == 9) {
                const uchar* rtop_ptr = reinterpret_cast<const uchar*>(row_top);
                const uchar* rmid_ptr = reinterpret_cast<const uchar*>(row_mid);
                const uchar* rbot_ptr = reinterpret_cast<const uchar*>(row_bot);
                int ix = (rmid_ptr[10] - rmid_ptr[8]) * 2 + (rtop_ptr[10] - rtop_ptr[8]) + (rbot_ptr[10] - rbot_ptr[8]);
                int iy = (rbot_ptr[9] - rtop_ptr[9]) * 2 + (rbot_ptr[8] - rtop_ptr[8]) + (rbot_ptr[10] - rtop_ptr[10]);
                a += ix * ix;
                b += iy * iy;
                c += ix * iy;
            }
        }
        auto hsumEpi32 = [](__m128i x) -> int { // sum up all elements as int32
            __m128i hi64 = _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2));
            __m128i sum64 = _mm_add_epi32(hi64, x);
            __m128i hi32 = _mm_shufflelo_epi16(sum64, _MM_SHUFFLE(1, 0, 3, 2));
            __m128i sum32 = _mm_add_epi32(sum64, hi32);
            return _mm_cvtsi128_si32(sum32);
        };
        a += hsumEpi32(va);
        b += hsumEpi32(vb);
        c += hsumEpi32(vc);
        res[ptidx] = scale_sq_sq * (static_cast<float>(a) * b - static_cast<float>(c) * c
                     - k * (static_cast<float>(a) + b) * (static_cast<float>(a) + b));
    }
}

速度测试

测试图像(来自 TUM 数据集):
测试图像
测试代码:

double benchmark(size_t samples, size_t iterations, const function<void()>& op)
{
    using namespace std::chrono;
    auto now = [] { return high_resolution_clock::now(); };

    samples = max(1ul, samples);
    iterations = max(1ul, iterations);
    double best_time = numeric_limits<double>::infinity();
    for (size_t s = 0; s < samples; ++s) {
        auto t1 = now();
        for (size_t i = 0; i < iterations; ++i)
            op();
        best_time = min(best_time, duration<double>(now() - t1).count());
    }
    best_time = best_time / iterations * 1e3;
    return best_time;
}

int main(int argc, char* argv[])
{
    string fn_src = argc > 1 ? argv[1] : "../scene.jpg";
    Mat img_src = imread(fn_src, IMREAD_GRAYSCALE);
    if (img_src.empty()) {
        cout << "cannot load image: " << fn_src << endl;
        return 0;
    }
    cout << "image size: " << img_src.cols << "*" << img_src.rows << endl;

    const int block_size = 7;
    const double k = 0.04;
    const int boundary = (block_size / 2) + 2;
    vector<Point2f> pts;
    for (int y = boundary; y < img_src.rows - boundary; ++y)
        for (int x = boundary; x < img_src.cols - boundary; ++x)
            pts.emplace_back(x, y);
    cout << "num of pts: " << pts.size() << endl;

    // benchmark
    vector<float> result_normal, result_sse;
    double time_normal = benchmark(3, 10, [&] {
        HarrisResponses(img_src, pts, result_normal, block_size, k);
    });
    cout << "time normal: " << time_normal << " ms" << endl;
    double time_sse = benchmark(3, 10, [&] {
        HarrisResponsesSSE(img_src, pts, result_sse, block_size, k);
    });
    cout << "time SSE: " << time_sse << endl;
    cout << "speed up: " << time_normal / time_sse << " ms" << endl;

    // compare results
    for (size_t i = 0; i < pts.size(); ++i) {
        float err = abs(result_normal[i] - result_sse[i]);
        if (err > 1e6) {
            auto pt = pts[i];
            cout << format("error too large: pt[%f, %f] err[%f]", pt.x, pt.y, err) << endl;
            break;
        }
    }

    return 0;
}

在本人 i7-8700 CPU 上测试结果为:

image size: 640*480
num of pts: 296100
time normal: 41.9455 ms
time SSE: 14.3351
speed up: 2.92607 ms

提速约为 3 倍。

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值