OpenCV 中的 HOGDescriptor

图波列夫

已于 2023-08-12 09:54:32 修改

阅读量2.4k

点赞数 3

分类专栏： opencv VisualTracking ObjectDetection 文章标签： OpenCV HOG

于 2019-05-27 10:06:13 首次发布

本文链接：https://blog.csdn.net/yiran103/article/details/90577182

版权

ObjectDetection 同时被 3 个专栏收录

9 篇文章

订阅专栏

VisualTracking

8 篇文章

订阅专栏

opencv

7 篇文章

订阅专栏

HOG 特征是深度学习兴起之前非常重要的一种特征。在早期，OpenCV 中的跟踪算法并不支持 HOG 特征，而检测模块中 HOGDescriptor 是一维的。后来，TrackerCSRT 的加入带来了 HOG 特征。然而，Object Detection 和 Deformable Part-based Models 共存令人十分困惑。二者的区别是什么？两个模块均包含 HOG 特征且 TrackerCSRT 与 Deformable Part-based Models 中的一样。经过阅读代码发现，Object Detection 中的 HOGDescriptor 是原版36维 HOG，而 Deformable Part-based Models 的是31维 FHOG。

HOGDescriptor 与 INRIA Object Detection and Localization Toolkit 中的相同。然而原始链接已经失效了，似乎可以参考 Joelgranados/Learcode。Object Detection 中提供的示例为 peopledetect.cpp，但该程序最终调用的是 HOGDescriptor::detect，而非 HOGDescriptor::compute。

HOGDescriptor 的优势为内部的 HOGCache 结构减少冗余计算，然而实现代码非常长，不利于阅读理解。测试代码 TEST(Objdetect_HOGDetector_Strict, accuracy) 是一个不错的起点，HOGDescriptorTester::compute 是简化版的 HOGDescriptor::compute。

HOGDescriptor 的计算过程分为5步：

计算每个像素的图像梯度 $G_x(x, y)$ 和 $G_y(x, y)$ ；
计算梯度幅值 $M (x, y)$ 和方向 $\theta(x, y)$ ；
将梯度幅度的加权投票累积到空间单元上的 $N$ 个方向区间；
在重叠的方块内标准化单元上的对比度；
收集检测窗口上所有重叠方块的直方图矢量。

HOGDescriptor::compute

gcd 求最大公約數。
alignSize() 将缓冲区大小与指定的字节数对齐。

    CV_INSTRUMENT_REGION();

    if( winStride == Size() )
        winStride = cellSize;
    Size cacheStride(gcd(winStride.width, blockStride.width),
                     gcd(winStride.height, blockStride.height));

    Size imgSize = _img.size();

    size_t nwindows = locations.size();
    padding.width = (int)alignSize(std::max(padding.width, 0), cacheStride.width);
    padding.height = (int)alignSize(std::max(padding.height, 0), cacheStride.height);
    Size paddedImgSize(imgSize.width + padding.width*2, imgSize.height + padding.height*2);

CV_OCL_RUN 会判断是否激活了 OpenCL。
在定义宏HAVE_OPENCL时才定义 ocl_compute 函数。

    CV_OCL_RUN(_img.dims() <= 2 && _img.type() == CV_8UC1 && _img.isUMat(),
        ocl_compute(_img, winStride, descriptors, HOGDescriptor::DESCR_FORMAT_COL_BY_COL, blockSize,
        cellSize, nbins, blockStride, winSize, (float)getWinSigma(), gammaCorrection, L2HysThreshold, signedGradient))

构造一个 HOGCache 对象。HOGCache 计算了整张图像的梯度。
HOGCache::windowsInImage 根据图像大小和窗口步长计算窗口大小。
HOGDescriptor::getDescriptorSize() 根据方向量化数以及窗口、方块和单元的大小计算特征描述符长度。

    Mat img = _img.getMat();
    HOGCache cache(this, img, padding, padding, nwindows == 0, cacheStride);

    if( !nwindows )
        nwindows = cache.windowsInImage(paddedImgSize, winStride).area();

    const HOGCache::BlockData* blockData = &cache.blockData[0];

    int nblocks = cache.nblocks.area();
    int blockHistogramSize = cache.blockHistogramSize;
    size_t dsize = getDescriptorSize();
    descriptors.resize(dsize*nwindows);

对于每个窗口，拷贝cache中的数据到descriptors中。
如果locations为空则调用 HOGCache::getWindow 根据图像大小和索引计算矩形窗口。
对于每个窗口的每个方块调用 HOGCache::getBlock，最终得到特征描述符descriptors。

    // for each window
    for( size_t i = 0; i < nwindows; i++ )
    {
        float* descriptor = &descriptors[i*dsize];

        Point pt0;
        if( !locations.empty() )
        {
            pt0 = locations[i];
            if( pt0.x < -padding.width || pt0.x > img.cols + padding.width - winSize.width ||
                pt0.y < -padding.height || pt0.y > img.rows + padding.height - winSize.height )
                continue;
        }
        else
        {
            pt0 = cache.getWindow(paddedImgSize, winStride, (int)i).tl() - Point(padding);
//            CV_Assert(pt0.x % cacheStride.width == 0 && pt0.y % cacheStride.height == 0);
        }

        for( int j = 0; j < nblocks; j++ )
        {
            const HOGCache::BlockData& bj = blockData[j];
            Point pt = pt0 + bj.imgOffset;

            float* dst = descriptor + bj.histOfs;
            const float* src = cache.getBlock(pt, dst);
            if( src != dst )
                memcpy(dst, src, blockHistogramSize * sizeof(float));
        }
    }

HOGCache

内部定义BlockData 和 PixData 结构体。

    struct BlockData
    {
        BlockData() :
            histOfs(0), imgOffset()
        { }

        int histOfs;
        Point imgOffset;
    };

    struct PixData
    {
        size_t gradOfs, qangleOfs;
        int histOfs[4];
        float histWeights[4];
        float gradWeight;
    };

    HOGCache();
    HOGCache(const HOGDescriptor* descriptor,
        const Mat& img, const Size& paddingTL, const Size& paddingBR,
        bool useCache, const Size& cacheStride);
    virtual ~HOGCache() { }
    virtual void init(const HOGDescriptor* descriptor,
        const Mat& img, const Size& paddingTL, const Size& paddingBR,
        bool useCache, const Size& cacheStride);

    Size windowsInImage(const Size& imageSize, const Size& winStride) const;
    Rect getWindow(const Size& imageSize, const Size& winStride, int idx) const;

    const float* getBlock(Point pt, float* buf);
    virtual void normalizeBlockHistogram(float* histogram) const;

    std::vector<PixData> pixData;
    std::vector<BlockData> blockData;

    bool useCache;
    std::vector<int> ymaxCached;
    Size winSize;
    Size cacheStride;
    Size nblocks, ncells;
    int blockHistogramSize;
    int count1, count2, count4;
    Point imgoffset;
    Mat_<float> blockCache;
    Mat_<uchar> blockCacheFlags;

    Mat grad, qangle;
    const HOGDescriptor* descriptor;

HOGCache::HOGCache

    init(_descriptor, _img, _paddingTL, _paddingBR, _useCache, _cacheStride);

HOGCache::init

HOGDescriptor::computeGradient 由图像计算梯度的幅值和量化角度。

    descriptor = _descriptor;
    cacheStride = _cacheStride;
    useCache = _useCache;

    descriptor->computeGradient(_img, grad, qangle, _paddingTL, _paddingBR);
    imgoffset = _paddingTL;

由描述符定义的尺寸计算方块的数量nblocks、方块内单元数量ncells。

    winSize = descriptor->winSize;
    Size blockSize = descriptor->blockSize;
    Size blockStride = descriptor->blockStride;
    Size cellSize = descriptor->cellSize;
    int i, j, nbins = descriptor->nbins;
    int rawBlockSize = blockSize.width*blockSize.height;

    nblocks = Size((winSize.width - blockSize.width)/blockStride.width + 1,
        (winSize.height - blockSize.height)/blockStride.height + 1);
    ncells = Size(blockSize.width/cellSize.width, blockSize.height/cellSize.height);
    blockHistogramSize = ncells.width*ncells.height*nbins;

如果使用缓存，创建blockCache和blockCacheFlags。
由grad的列数和窗口高度计算cacheSize，意味着blockCache会缓存窗口滑动一行所覆盖的方块特征。
ymaxCached用于记录已缓存的行。在 HOGCache::getBlock 赋值，与blockCacheFlags搭配使用。

    if( useCache )
    {
        Size cacheSize((grad.cols - blockSize.width)/cacheStride.width+1,
            (winSize.height/cacheStride.height)+1);

        blockCache.create(cacheSize.height, cacheSize.width*blockHistogramSize);
        blockCacheFlags.create(cacheSize);

        size_t cacheRows = blockCache.rows;
        ymaxCached.resize(cacheRows);
        for(size_t ii = 0; ii < cacheRows; ii++ )
            ymaxCached[ii] = -1;
    }

计算方块内的加窗权重。bh和bw是块的中心位置。_di和_dj是距离的平方。weights是窗函数。

    Mat_<float> weights(blockSize);
    float sigma = (float)descriptor->getWinSigma();
    float scale = 1.f/(sigma*sigma*2);

    {
        AutoBuffer<float> di(blockSize.height), dj(blockSize.width);
        float* _di = di.data(), *_dj = dj.data();
        float bh = blockSize.height * 0.5f, bw = blockSize.width * 0.5f;

        i = 0;
    #if CV_SSE2
        const int a[] = { 0, 1, 2, 3 };
        __m128i idx = _mm_loadu_si128((__m128i*)a);
        __m128 _bw = _mm_set1_ps(bw), _bh = _mm_set1_ps(bh);
        __m128i ifour = _mm_set1_epi32(4);

        for (; i <= blockSize.height - 4; i += 4)
        {
            __m128 t = _mm_sub_ps(_mm_cvtepi32_ps(idx), _bh);
            t = _mm_mul_ps(t, t);
            idx = _mm_add_epi32(idx, ifour);
            _mm_storeu_ps(_di + i, t);
        }
    #elif CV_NEON
        const int a[] = { 0, 1, 2, 3 };
        int32x4_t idx = vld1q_s32(a);
        float32x4_t _bw = vdupq_n_f32(bw), _bh = vdupq_n_f32(bh);
        int32x4_t ifour = vdupq_n_s32(4);

        for (; i <= blockSize.height - 4; i += 4)
        {
            float32x4_t t = vsubq_f32(vcvtq_f32_s32(idx), _bh);
            t = vmulq_f32(t, t);
            idx = vaddq_s32(idx, ifour);
            vst1q_f32(_di + i, t);
        }
    #endif
        for ( ; i < blockSize.height; ++i)
        {
            _di[i] = i - bh;
            _di[i] *= _di[i];
        }

        j = 0;
    #if CV_SSE2
        idx = _mm_loadu_si128((__m128i*)a);
        for (; j <= blockSize.width - 4; j += 4)
        {
            __m128 t = _mm_sub_ps(_mm_cvtepi32_ps(idx), _bw);
            t = _mm_mul_ps(t, t);
            idx = _mm_add_epi32(idx, ifour);
            _mm_storeu_ps(_dj + j, t);
        }
    #elif CV_NEON
        idx = vld1q_s32(a);
        for (; j <= blockSize.width - 4; j += 4)
        {
            float32x4_t t = vsubq_f32(vcvtq_f32_s32(idx), _bw);
            t = vmulq_f32(t, t);
            idx = vaddq_s32(idx, ifour);
            vst1q_f32(_dj + j, t);
        }
    #endif
        for ( ; j < blockSize.width; ++j)
        {
            _dj[j] = j - bw;
            _dj[j] *= _dj[j];
        }

        for(i = 0; i < blockSize.height; i++)
            for(j = 0; j < blockSize.width; j++)
                weights(i,j) = std::exp(-(_di[i] + _dj[j])*scale);
    }

初始化pixData和blockData两个查找表。blockData为窗口中的方块数组，pixData为方块对应的像素数组，首先申请为方块元素的3倍大小。
由于单元格之间没有重叠，因此方块直接处理每个像素。

    blockData.resize(nblocks.width*nblocks.height);
    pixData.resize(rawBlockSize*3);

    // Initialize 2 lookup tables, pixData & blockData.
    // Here is why:
    //
    // The detection algorithm runs in 4 nested loops (at each pyramid layer):
    //  loop over the windows within the input image
    //    loop over the blocks within each window
    //      loop over the cells within each block
    //        loop over the pixels in each cell
    //
    // As each of the loops runs over a 2-dimensional array,
    // we could get 8(!) nested loops in total, which is very-very slow.
    //
    // To speed the things up, we do the following:
    //   1. loop over windows is unrolled in the HOGDescriptor::{compute|detect} methods;
    //         inside we compute the current search window using getWindow() method.
    //         Yes, it involves some overhead (function call + couple of divisions),
    //         but it's tiny in fact.
    //   2. loop over the blocks is also unrolled. Inside we use pre-computed blockData[j]
    //         to set up gradient and histogram pointers.
    //   3. loops over cells and pixels in each cell are merged
    //       (since there is no overlap between cells, each pixel in the block is processed once)
    //      and also unrolled. Inside we use PixData[k] to access the gradient values and
    //      update the histogram
    //

三线性插值开销较大，因此预计算方块中每个元素对应的空间权重。
histWeights为插值权重。histOfs记录元素的位置。
count1、count2和count4分别表示方块直方图所统计的元素插值数量。由于count1、count2和count4在循环的不同分支里，所以count1 + count2 + count4 = rawBlockSize。
这里的详细内容可以参考 Fast Calculation of Histogram of Oriented Gradient Feature by Removing Redundancy in Overlapping Block。

    count1 = count2 = count4 = 0;
    for( j = 0; j < blockSize.width; j++ )
        for( i = 0; i < blockSize.height; i++ )
        {
            PixData* data = 0;
            float cellX = (j+0.5f)/cellSize.width - 0.5f;
            float cellY = (i+0.5f)/cellSize.height - 0.5f;
            int icellX0 = cvFloor(cellX);
            int icellY0 = cvFloor(cellY);
            int icellX1 = icellX0 + 1, icellY1 = icellY0 + 1;
            cellX -= icellX0;
            cellY -= icellY0;

如果像素对应的4个单元格均未超出方块，则在pixData中记录4个单元格的直方图偏移和权重；否则记录2个或者1个。

            if( (unsigned)icellX0 < (unsigned)ncells.width &&
               (unsigned)icellX1 < (unsigned)ncells.width )
            {
                if( (unsigned)icellY0 < (unsigned)ncells.height &&
                   (unsigned)icellY1 < (unsigned)ncells.height )
                {
                    data = &pixData[rawBlockSize*2 + (count4++)];
                    data->histOfs[0] = (icellX0*ncells.height + icellY0)*nbins;
                    data->histWeights[0] = (1.f - cellX)*(1.f - cellY);
                    data->histOfs[1] = (icellX1*ncells.height + icellY0)*nbins;
                    data->histWeights[1] = cellX*(1.f - cellY);
                    data->histOfs[2] = (icellX0*ncells.height + icellY1)*nbins;
                    data->histWeights[2] = (1.f - cellX)*cellY;
                    data->histOfs[3] = (icellX1*ncells.height + icellY1)*nbins;
                    data->histWeights[3] = cellX*cellY;
                }
                else
                {
                    data = &pixData[rawBlockSize + (count2++)];
                    if( (unsigned)icellY0 < (unsigned)ncells.height )
                    {
                        icellY1 = icellY0;
                        cellY = 1.f - cellY;
                    }
                    data->histOfs[0] = (icellX0*ncells.height + icellY1)*nbins;
                    data->histWeights[0] = (1.f - cellX)*cellY;
                    data->histOfs[1] = (icellX1*ncells.height + icellY1)*nbins;
                    data->histWeights[1] = cellX*cellY;
                    data->histOfs[2] = data->histOfs[3] = 0;
                    data->histWeights[2] = data->histWeights[3] = 0;
                }
            }
            else
            {
                if( (unsigned)icellX0 < (unsigned)ncells.width )
                {
                    icellX1 = icellX0;
                    cellX = 1.f - cellX;
                }

                if( (unsigned)icellY0 < (unsigned)ncells.height &&
                   (unsigned)icellY1 < (unsigned)ncells.height )
                {
                    data = &pixData[rawBlockSize + (count2++)];
                    data->histOfs[0] = (icellX1*ncells.height + icellY0)*nbins;
                    data->histWeights[0] = cellX*(1.f - cellY);
                    data->histOfs[1] = (icellX1*ncells.height + icellY1)*nbins;
                    data->histWeights[1] = cellX*cellY;
                    data->histOfs[2] = data->histOfs[3] = 0;
                    data->histWeights[2] = data->histWeights[3] = 0;
                }
                else
                {
                    data = &pixData[count1++];
                    if( (unsigned)icellY0 < (unsigned)ncells.height )
                    {
                        icellY1 = icellY0;
                        cellY = 1.f - cellY;
                    }
                    data->histOfs[0] = (icellX1*ncells.height + icellY1)*nbins;
                    data->histWeights[0] = cellX*cellY;
                    data->histOfs[1] = data->histOfs[2] = data->histOfs[3] = 0;
                    data->histWeights[1] = data->histWeights[2] = data->histWeights[3] = 0;
                }
            }
            data->gradOfs = (grad.cols*i + j)*2;
            data->qangleOfs = (qangle.cols*i + j)*2;
            data->gradWeight = weights(i,j);
        }

重组pixData。

    assert( count1 + count2 + count4 == rawBlockSize );
    // defragment pixData
    for( j = 0; j < count2; j++ )
        pixData[j + count1] = pixData[j + rawBlockSize];
    for( j = 0; j < count4; j++ )
        pixData[j + count1 + count2] = pixData[j + rawBlockSize*2];
    count2 += count1;
    count4 += count2;

初始化blockData，记录每个方块特征的偏移以及对应图像上的偏移。

    // initialize blockData
    for( j = 0; j < nblocks.width; j++ )
        for( i = 0; i < nblocks.height; i++ )
        {
            BlockData& data = blockData[j*nblocks.height + i];
            data.histOfs = (j*nblocks.height + i)*blockHistogramSize;
            data.imgOffset = Point(j*blockStride.width,i*blockStride.height);
        }

HOGDescriptor::computeGradient

Mat::locateROI 在父矩阵中找到矩阵头。使用 Mat::row、Mat::col、Mat::rowRange、 Mat::colRange 等从矩阵中提取子矩阵后，生成的子矩阵仅指向原始大矩阵的一部分。然而，每个子矩阵包含有助于重建原始矩阵大小的信息以及提取的子矩阵在原始矩阵内的位置（由datastart和dataend字段表示）。

    CV_INSTRUMENT_REGION();

    CV_Assert( img.type() == CV_8U || img.type() == CV_8UC3 );

    Size gradsize(img.cols + paddingTL.width + paddingBR.width,
        img.rows + paddingTL.height + paddingBR.height);
    grad.create(gradsize, CV_32FC2);  // <magnitude*(1-alpha), magnitude*alpha>
    qangle.create(gradsize, CV_8UC2); // [0..nbins-1] - quantized gradient orientation

    Size wholeSize;
    Point roiofs;
    img.locateROI(wholeSize, roiofs);

    int i, x, y;
    int cn = img.channels();

创建_lut数组。

    Mat_<float> _lut(1, 256);
    const float* const lut = &_lut(0,0);
#if CV_SSE2
    const int indices[] = { 0, 1, 2, 3 };
    __m128i idx = _mm_loadu_si128((const __m128i*)indices);
    __m128i ifour = _mm_set1_epi32(4);

    float* const _data = &_lut(0, 0);
    if( gammaCorrection )
        for( i = 0; i < 256; i += 4 )
        {
            _mm_storeu_ps(_data + i, _mm_sqrt_ps(_mm_cvtepi32_ps(idx)));
            idx = _mm_add_epi32(idx, ifour);
        }
    else
        for( i = 0; i < 256; i += 4 )
        {
            _mm_storeu_ps(_data + i, _mm_cvtepi32_ps(idx));
            idx = _mm_add_epi32(idx, ifour);
        }
#elif CV_NEON
    const int indices[] = { 0, 1, 2, 3 };
    uint32x4_t idx = *(uint32x4_t*)indices;
    uint32x4_t ifour = vdupq_n_u32(4);

    float* const _data = &_lut(0, 0);
    if( gammaCorrection )
        for( i = 0; i < 256; i++ )
            _lut(0,i) = std::sqrt((float)i);
    else
        for( i = 0; i < 256; i += 4 )
        {
            vst1q_f32(_data + i, vcvtq_f32_u32(idx));
            idx = vaddq_u32 (idx, ifour);
        }
#else
    if( gammaCorrection )
        for( i = 0; i < 256; i++ )
            _lut(0,i) = std::sqrt((float)i);
    else
        for( i = 0; i < 256; i++ )
            _lut(0,i) = (float)i;
#endif

AutoBuffer 自动分配的缓冲区类。该类用于函数和方法中的临时缓冲区。如果一个临时缓冲区通常很小（几个K的内存），但它的大小取决于参数，那么在堆栈上创建一个小的固定大小的数组并在它足够大时使用它是有意义的。如果所需的缓冲区大小大于固定大小，则动态分配另一个足够大小的缓冲区，并在处理后释放。因此，在典型情况下，当缓冲区大小很小时，没有与malloc()/free()相关的开销。同时，处理数据的大小没有限制。
_dbuf申请的内存给Dx、Dy、Mag和Angle使用。

borderInterpolate 计算外推像素的源位置。

如果通道数为3，xmap存储位置的步长调整为3。

    AutoBuffer<int> mapbuf(gradsize.width + gradsize.height + 4);
    int* xmap = mapbuf.data() + 1;
    int* ymap = xmap + gradsize.width + 2;

    const int borderType = (int)BORDER_REFLECT_101;

    for( x = -1; x < gradsize.width + 1; x++ )
        xmap[x] = borderInterpolate(x - paddingTL.width + roiofs.x,
        wholeSize.width, borderType) - roiofs.x;
    for( y = -1; y < gradsize.height + 1; y++ )
        ymap[y] = borderInterpolate(y - paddingTL.height + roiofs.y,
        wholeSize.height, borderType) - roiofs.y;

    // x- & y- derivatives for the whole row
    int width = gradsize.width;
    AutoBuffer<float> _dbuf(width*4);
    float* const dbuf = _dbuf.data();
    Mat Dx(1, width, CV_32F, dbuf);
    Mat Dy(1, width, CV_32F, dbuf + width);
    Mat Mag(1, width, CV_32F, dbuf + width*2);
    Mat Angle(1, width, CV_32F, dbuf + width*3);

    if (cn == 3)
    {
        int end = gradsize.width + 2;
        xmap -= 1, x = 0;
#if CV_SSE2
        for ( ; x <= end - 4; x += 4)
        {
            __m128i mul_res = _mm_loadu_si128((const __m128i*)(xmap + x));
            mul_res = _mm_add_epi32(_mm_add_epi32(mul_res, mul_res), mul_res); // multiply by 3
            _mm_storeu_si128((__m128i*)(xmap + x), mul_res);
        }
#elif CV_NEON
        int32x4_t ithree = vdupq_n_s32(3);
        for ( ; x <= end - 4; x += 4)
            vst1q_s32(xmap + x, vmulq_s32(ithree, vld1q_s32(xmap + x)));
#endif
        for ( ; x < end; ++x)
            xmap[x] *= 3;
        xmap += 1;
    }

循环内处理每一行。如果是单通道直接计算；如果是多通道使用向量指令。

    float angleScale = signedGradient ? (float)(nbins/(2.0*CV_PI)) : (float)(nbins/CV_PI);
    for( y = 0; y < gradsize.height; y++ )
    {
        const uchar* imgPtr  = img.ptr(ymap[y]);
        //In case subimage is used ptr() generates an assert for next and prev rows
        //(see http://code.opencv.org/issues/4149)
        const uchar* prevPtr = img.data + img.step*ymap[y-1];
        const uchar* nextPtr = img.data + img.step*ymap[y+1];

        float* gradPtr = grad.ptr<float>(y);
        uchar* qanglePtr = qangle.ptr(y);

        if( cn == 1 )
        {
            for( x = 0; x < width; x++ )
            {
                int x1 = xmap[x];
                dbuf[x] = (float)(lut[imgPtr[xmap[x+1]]] - lut[imgPtr[xmap[x-1]]]);
                dbuf[width + x] = (float)(lut[nextPtr[x1]] - lut[prevPtr[x1]]);
            }
        }
        else
        {
            x = 0;
#if CV_SSE2
            for( ; x <= width - 4; x += 4 )
            {
                int x0 = xmap[x], x1 = xmap[x+1], x2 = xmap[x+2], x3 = xmap[x+3];
                typedef const uchar* const T;
                T p02 = imgPtr + xmap[x+1], p00 = imgPtr + xmap[x-1];
                T p12 = imgPtr + xmap[x+2], p10 = imgPtr + xmap[x];
                T p22 = imgPtr + xmap[x+3], p20 = p02;
                T p32 = imgPtr + xmap[x+4], p30 = p12;

                __m128 _dx0 = _mm_sub_ps(_mm_set_ps(lut[p32[0]], lut[p22[0]], lut[p12[0]], lut[p02[0]]),
                                         _mm_set_ps(lut[p30[0]], lut[p20[0]], lut[p10[0]], lut[p00[0]]));
                __m128 _dx1 = _mm_sub_ps(_mm_set_ps(lut[p32[1]], lut[p22[1]], lut[p12[1]], lut[p02[1]]),
                                         _mm_set_ps(lut[p30[1]], lut[p20[1]], lut[p10[1]], lut[p00[1]]));
                __m128 _dx2 = _mm_sub_ps(_mm_set_ps(lut[p32[2]], lut[p22[2]], lut[p12[2]], lut[p02[2]]),
                                         _mm_set_ps(lut[p30[2]], lut[p20[2]], lut[p10[2]], lut[p00[2]]));

                __m128 _dy0 = _mm_sub_ps(_mm_set_ps(lut[nextPtr[x3]], lut[nextPtr[x2]], lut[nextPtr[x1]], lut[nextPtr[x0]]),
                                         _mm_set_ps(lut[prevPtr[x3]], lut[prevPtr[x2]], lut[prevPtr[x1]], lut[prevPtr[x0]]));
                __m128 _dy1 = _mm_sub_ps(_mm_set_ps(lut[nextPtr[x3+1]], lut[nextPtr[x2+1]], lut[nextPtr[x1+1]], lut[nextPtr[x0+1]]),
                                         _mm_set_ps(lut[prevPtr[x3+1]], lut[prevPtr[x2+1]], lut[prevPtr[x1+1]], lut[prevPtr[x0+1]]));
                __m128 _dy2 = _mm_sub_ps(_mm_set_ps(lut[nextPtr[x3+2]], lut[nextPtr[x2+2]], lut[nextPtr[x1+2]], lut[nextPtr[x0+2]]),
                                         _mm_set_ps(lut[prevPtr[x3+2]], lut[prevPtr[x2+2]], lut[prevPtr[x1+2]], lut[prevPtr[x0+2]]));

                __m128 _mag0 = _mm_add_ps(_mm_mul_ps(_dx0, _dx0), _mm_mul_ps(_dy0, _dy0));
                __m128 _mag1 = _mm_add_ps(_mm_mul_ps(_dx1, _dx1), _mm_mul_ps(_dy1, _dy1));
                __m128 _mag2 = _mm_add_ps(_mm_mul_ps(_dx2, _dx2), _mm_mul_ps(_dy2, _dy2));

                __m128 mask = _mm_cmpgt_ps(_mag2, _mag1);
                _dx2 = _mm_or_ps(_mm_and_ps(_dx2, mask), _mm_andnot_ps(mask, _dx1));
                _dy2 = _mm_or_ps(_mm_and_ps(_dy2, mask), _mm_andnot_ps(mask, _dy1));

                mask = _mm_cmpgt_ps(_mm_max_ps(_mag2, _mag1), _mag0);
                _dx2 = _mm_or_ps(_mm_and_ps(_dx2, mask), _mm_andnot_ps(mask, _dx0));
                _dy2 = _mm_or_ps(_mm_and_ps(_dy2, mask), _mm_andnot_ps(mask, _dy0));

                _mm_storeu_ps(dbuf + x, _dx2);
                _mm_storeu_ps(dbuf + x + width, _dy2);
            }
#elif CV_NEON
            for( ; x <= width - 4; x += 4 )
            {
                int x0 = xmap[x], x1 = xmap[x+1], x2 = xmap[x+2], x3 = xmap[x+3];
                typedef const uchar* const T;
                T p02 = imgPtr + xmap[x+1], p00 = imgPtr + xmap[x-1];
                T p12 = imgPtr + xmap[x+2], p10 = imgPtr + xmap[x];
                T p22 = imgPtr + xmap[x+3], p20 = p02;
                T p32 = imgPtr + xmap[x+4], p30 = p12;

                float32x4_t _dx0 = vsubq_f32(vsetq_f32(lut[p02[0]], lut[p12[0]], lut[p22[0]], lut[p32[0]]),
                                             vsetq_f32(lut[p00[0]], lut[p10[0]], lut[p20[0]], lut[p30[0]]));
                float32x4_t _dx1 = vsubq_f32(vsetq_f32(lut[p02[1]], lut[p12[1]], lut[p22[1]], lut[p32[1]]),
                                             vsetq_f32(lut[p00[1]], lut[p10[1]], lut[p20[1]], lut[p30[1]]));
                float32x4_t _dx2 = vsubq_f32(vsetq_f32(lut[p02[2]], lut[p12[2]], lut[p22[2]], lut[p32[2]]),
                                             vsetq_f32(lut[p00[2]], lut[p10[2]], lut[p20[2]], lut[p30[2]]));

                float32x4_t _dy0 = vsubq_f32(vsetq_f32(lut[nextPtr[x0]], lut[nextPtr[x1]], lut[nextPtr[x2]], lut[nextPtr[x3]]),
                                             vsetq_f32(lut[prevPtr[x0]], lut[prevPtr[x1]], lut[prevPtr[x2]], lut[prevPtr[x3]]));
                float32x4_t _dy1 = vsubq_f32(vsetq_f32(lut[nextPtr[x0+1]], lut[nextPtr[x1+1]], lut[nextPtr[x2+1]], lut[nextPtr[x3+1]]),
                                             vsetq_f32(lut[prevPtr[x0+1]], lut[prevPtr[x1+1]], lut[prevPtr[x2+1]], lut[prevPtr[x3+1]]));
                float32x4_t _dy2 = vsubq_f32(vsetq_f32(lut[nextPtr[x0+2]], lut[nextPtr[x1+2]], lut[nextPtr[x2+2]], lut[nextPtr[x3+2]]),
                                             vsetq_f32(lut[prevPtr[x0+2]], lut[prevPtr[x1+2]], lut[prevPtr[x2+2]], lut[prevPtr[x3+2]]));

                float32x4_t _mag0 = vaddq_f32(vmulq_f32(_dx0, _dx0), vmulq_f32(_dy0, _dy0));
                float32x4_t _mag1 = vaddq_f32(vmulq_f32(_dx1, _dx1), vmulq_f32(_dy1, _dy1));
                float32x4_t _mag2 = vaddq_f32(vmulq_f32(_dx2, _dx2), vmulq_f32(_dy2, _dy2));

                uint32x4_t mask = vcgtq_f32(_mag2, _mag1);
                _dx2 = vbslq_f32(mask, _dx2, _dx1);
                _dy2 = vbslq_f32(mask, _dy2, _dy1);

                mask = vcgtq_f32(vmaxq_f32(_mag2, _mag1), _mag0);
                _dx2 = vbslq_f32(mask, _dx2, _dx0);
                _dy2 = vbslq_f32(mask, _dy2, _dy0);

                vst1q_f32(dbuf + x, _dx2);
                vst1q_f32(dbuf + x + width, _dy2);
            }
#endif

$\begin{aligned} G_y(x, y) &= \frac{I(x+1, y) -I(x-1, y)}{2}\\ G_y(x, y) &= \frac{I(x, y+1) -I(x, y-1)}{2} \end{aligned}$

            for( ; x < width; x++ )
            {
                int x1 = xmap[x];
                float dx0, dy0, dx, dy, mag0, mag;
                const uchar* p2 = imgPtr + xmap[x+1];
                const uchar* p0 = imgPtr + xmap[x-1];

                dx0 = lut[p2[2]] - lut[p0[2]];
                dy0 = lut[nextPtr[x1+2]] - lut[prevPtr[x1+2]];
                mag0 = dx0*dx0 + dy0*dy0;

                dx = lut[p2[1]] - lut[p0[1]];
                dy = lut[nextPtr[x1+1]] - lut[prevPtr[x1+1]];
                mag = dx*dx + dy*dy;
                if( mag0 < mag )
                {
                    dx0 = dx;
                    dy0 = dy;
                    mag0 = mag;
                }

                dx = lut[p2[0]] - lut[p0[0]];
                dy = lut[nextPtr[x1]] - lut[prevPtr[x1]];
                mag = dx*dx + dy*dy;
                if( mag0 < mag )
                {
                    dx0 = dx;
                    dy0 = dy;
                    mag0 = mag;
                }

                dbuf[x] = dx0;
                dbuf[x+width] = dy0;
            }
        }

cartToPolar 计算2D矢量的大小和角度。

函数 cv::cartToPolar 计算每个2D 向量(x(I),y(I))的幅度、角度或两者：
$\begin{array}{l} \mathrm{magnitude} (I)= \sqrt{\mathrm{x}(I)^2+\mathrm{y}(I)^2} , \\ \mathrm{angle} (I)= \mathrm{atan2} ( \mathrm{y} (I), \mathrm{x} (I))[ \cdot180 / \pi ] \end{array}$

        // computing angles and magnidutes
        cartToPolar( Dx, Dy, Mag, Angle, false );

只有CV_SSE2而没有CV_NEON。由于前一循环会改变x的值，所以能够自动跳过后面的循环。

        // filling the result matrix
        x = 0;
#if CV_SSE2
        __m128 fhalf = _mm_set1_ps(0.5f), fzero = _mm_setzero_ps();
        __m128 _angleScale = _mm_set1_ps(angleScale), fone = _mm_set1_ps(1.0f);
        __m128i ione = _mm_set1_epi32(1), _nbins = _mm_set1_epi32(nbins), izero = _mm_setzero_si128();

        for ( ; x <= width - 4; x += 4)
        {
            int x2 = x << 1;
            __m128 _mag = _mm_loadu_ps(dbuf + x + (width << 1));
            __m128 _angle = _mm_loadu_ps(dbuf + x + width * 3);
            _angle = _mm_sub_ps(_mm_mul_ps(_angleScale, _angle), fhalf);

            __m128 sign = _mm_and_ps(fone, _mm_cmplt_ps(_angle, fzero));
            __m128i _hidx = _mm_cvttps_epi32(_angle);
            _hidx = _mm_sub_epi32(_hidx, _mm_cvtps_epi32(sign));
            _angle = _mm_sub_ps(_angle, _mm_cvtepi32_ps(_hidx));

            __m128 ft0 = _mm_mul_ps(_mag, _mm_sub_ps(fone, _angle));
            __m128 ft1 = _mm_mul_ps(_mag, _angle);
            __m128 ft2 = _mm_unpacklo_ps(ft0, ft1);
            __m128 ft3 = _mm_unpackhi_ps(ft0, ft1);

            _mm_storeu_ps(gradPtr + x2, ft2);
            _mm_storeu_ps(gradPtr + x2 + 4, ft3);

            __m128i mask0 = _mm_sub_epi32(izero, _mm_srli_epi32(_hidx, 31));
            __m128i it0 = _mm_and_si128(mask0, _nbins);
            mask0 = _mm_cmplt_epi32(_hidx, _nbins);
            __m128i it1 = _mm_andnot_si128(mask0, _nbins);
            _hidx = _mm_add_epi32(_hidx, _mm_sub_epi32(it0, it1));

            it0 = _mm_packus_epi16(_mm_packs_epi32(_hidx, izero), izero);
            _hidx = _mm_add_epi32(ione, _hidx);
            _hidx = _mm_and_si128(_hidx, _mm_cmplt_epi32(_hidx, _nbins));
            it1 = _mm_packus_epi16(_mm_packs_epi32(_hidx, izero), izero);
            it0 = _mm_unpacklo_epi8(it0, it1);

            _mm_storel_epi64((__m128i*)(qanglePtr + x2), it0);
        }
#endif

cvFloor() 将浮点数舍入为最接近的不大于原始值的整数。
Angle中元素乘以angleScale后得到其级数。
由角度对左右两个幅度产生贡献。

        for( ; x < width; x++ )
        {
            float mag = dbuf[x+width*2], angle = dbuf[x+width*3]*angleScale - 0.5f;
            int hidx = cvFloor(angle);
            angle -= hidx;
            gradPtr[x*2] = mag*(1.f - angle);
            gradPtr[x*2+1] = mag*angle;

            if( hidx < 0 )
                hidx += nbins;
            else if( hidx >= nbins )
                hidx -= nbins;

            CV_Assert( (unsigned)hidx < (unsigned)nbins );

            qanglePtr[x*2] = (uchar)hidx;
            hidx++;
            hidx &= hidx < nbins ? -1 : 0;
            qanglePtr[x*2+1] = (uchar)hidx;
        }
    }

HOGCache::getBlock

imgoffset由于填充造成。

    float* blockHist = buf;
    assert(descriptor != 0);

//    Size blockSize = descriptor->blockSize;
    pt += imgoffset;

//    CV_Assert( (unsigned)pt.x <= (unsigned)(grad.cols - blockSize.width) &&
//        (unsigned)pt.y <= (unsigned)(grad.rows - blockSize.height) );

    if( useCache )
    {
        CV_Assert( pt.x % cacheStride.width == 0 &&
                   pt.y % cacheStride.height == 0 );
        Point cacheIdx(pt.x/cacheStride.width,
                       (pt.y/cacheStride.height) % blockCache.rows);
        if( pt.y != ymaxCached[cacheIdx.y] )
        {
            Mat_<uchar> cacheRow = blockCacheFlags.row(cacheIdx.y);
            cacheRow = (uchar)0;
            ymaxCached[cacheIdx.y] = pt.y;
        }

        blockHist = &blockCache[cacheIdx.y][cacheIdx.x*blockHistogramSize];
        uchar& computedFlag = blockCacheFlags(cacheIdx.y, cacheIdx.x);
        if( computedFlag != 0 )
            return blockHist;
        computedFlag = (uchar)1; // set it at once, before actual computing
    }

count1、count2和count4在 HOGCache::init 中计算。

    int k, C1 = count1, C2 = count2, C4 = count4;
    const float* gradPtr = grad.ptr<float>(pt.y) + pt.x*2;
    const uchar* qanglePtr = qangle.ptr(pt.y) + pt.x*2;

//    CV_Assert( blockHist != 0 );
    memset(blockHist, 0, sizeof(float) * blockHistogramSize);

    const PixData* _pixData = &pixData[0];

    for( k = 0; k < C1; k++ )
    {
        const PixData& pk = _pixData[k];
        const float* const a = gradPtr + pk.gradOfs;
        float w = pk.gradWeight*pk.histWeights[0];
        const uchar* h = qanglePtr + pk.qangleOfs;
        int h0 = h[0], h1 = h[1];

        float* hist = blockHist + pk.histOfs[0];
        float t0 = hist[h0] + a[0]*w;
        float t1 = hist[h1] + a[1]*w;
        hist[h0] = t0; hist[h1] = t1;
    }

#if CV_SSE2
    float hist0[4], hist1[4];
    for( ; k < C2; k++ )
    {
        const PixData& pk = _pixData[k];
        const float* const a = gradPtr + pk.gradOfs;
        const uchar* const h = qanglePtr + pk.qangleOfs;
        int h0 = h[0], h1 = h[1];

        __m128 _a0 = _mm_set1_ps(a[0]), _a1 = _mm_set1_ps(a[1]);
        __m128 _w = _mm_mul_ps(_mm_set1_ps(pk.gradWeight), _mm_loadu_ps(pk.histWeights));
        __m128 _t0 = _mm_mul_ps(_a0, _w), _t1 = _mm_mul_ps(_a1, _w);

        _mm_storeu_ps(hist0, _t0);
        _mm_storeu_ps(hist1, _t1);

        float* hist = blockHist + pk.histOfs[0];
        float t0 = hist[h0] + hist0[0];
        float t1 = hist[h1] + hist1[0];
        hist[h0] = t0; hist[h1] = t1;

        hist = blockHist + pk.histOfs[1];
        t0 = hist[h0] + hist0[1];
        t1 = hist[h1] + hist1[1];
        hist[h0] = t0; hist[h1] = t1;
    }
#elif CV_NEON
    float hist0[4], hist1[4];
    for( ; k < C2; k++ )
    {
        const PixData& pk = _pixData[k];
        const float* const a = gradPtr + pk.gradOfs;
        const uchar* const h = qanglePtr + pk.qangleOfs;
        int h0 = h[0], h1 = h[1];

        float32x4_t _a0 = vdupq_n_f32(a[0]), _a1 = vdupq_n_f32(a[1]);
        float32x4_t _w = vmulq_f32(vdupq_n_f32(pk.gradWeight), vld1q_f32(pk.histWeights));

        float32x4_t _h0 = vsetq_f32((blockHist + pk.histOfs[0])[h0], (blockHist + pk.histOfs[1])[h0], 0,  0);
        float32x4_t _h1 = vsetq_f32((blockHist + pk.histOfs[0])[h1], (blockHist + pk.histOfs[1])[h1], 0,  0);

        float32x4_t _t0 = vmlaq_f32(_h0, _a0, _w), _t1 = vmlaq_f32(_h1, _a1, _w);
        vst1q_f32(hist0, _t0);
        vst1q_f32(hist1, _t1);

        (blockHist + pk.histOfs[0])[h0] = hist0[0];
        (blockHist + pk.histOfs[1])[h0] = hist0[1];

        (blockHist + pk.histOfs[0])[h1] = hist1[0];
        (blockHist + pk.histOfs[1])[h1] = hist1[1];
    }
#else
    for( ; k < C2; k++ )
    {
        const PixData& pk = _pixData[k];
        const float* const a = gradPtr + pk.gradOfs;
        float w, t0, t1, a0 = a[0], a1 = a[1];
        const uchar* const h = qanglePtr + pk.qangleOfs;
        int h0 = h[0], h1 = h[1];

        float* hist = blockHist + pk.histOfs[0];
        w = pk.gradWeight*pk.histWeights[0];
        t0 = hist[h0] + a0*w;
        t1 = hist[h1] + a1*w;
        hist[h0] = t0; hist[h1] = t1;

        hist = blockHist + pk.histOfs[1];
        w = pk.gradWeight*pk.histWeights[1];
        t0 = hist[h0] + a0*w;
        t1 = hist[h1] + a1*w;
        hist[h0] = t0; hist[h1] = t1;
    }
#endif

#if CV_SSE2
    for( ; k < C4; k++ )
    {
        const PixData& pk = _pixData[k];
        const float* const a = gradPtr + pk.gradOfs;
        const uchar* const h = qanglePtr + pk.qangleOfs;
        int h0 = h[0], h1 = h[1];

        __m128 _a0 = _mm_set1_ps(a[0]), _a1 = _mm_set1_ps(a[1]);
        __m128 _w = _mm_mul_ps(_mm_set1_ps(pk.gradWeight), _mm_loadu_ps(pk.histWeights));
        __m128 _t0 = _mm_mul_ps(_a0, _w), _t1 = _mm_mul_ps(_a1, _w);

        _mm_storeu_ps(hist0, _t0);
        _mm_storeu_ps(hist1, _t1);

        float* hist = blockHist + pk.histOfs[0];
        float t0 = hist[h0] + hist0[0];
        float t1 = hist[h1] + hist1[0];
        hist[h0] = t0; hist[h1] = t1;

        hist = blockHist + pk.histOfs[1];
        t0 = hist[h0] + hist0[1];
        t1 = hist[h1] + hist1[1];
        hist[h0] = t0; hist[h1] = t1;

        hist = blockHist + pk.histOfs[2];
        t0 = hist[h0] + hist0[2];
        t1 = hist[h1] + hist1[2];
        hist[h0] = t0; hist[h1] = t1;

        hist = blockHist + pk.histOfs[3];
        t0 = hist[h0] + hist0[3];
        t1 = hist[h1] + hist1[3];
        hist[h0] = t0; hist[h1] = t1;

//        __m128 _hist0 = _mm_set_ps((blockHist + pk.histOfs[3])[h0], (blockHist + pk.histOfs[2])[h0],
//            (blockHist + pk.histOfs[1])[h0], (blockHist + pk.histOfs[0])[h0]);
//        __m128 _hist1 = _mm_set_ps((blockHist + pk.histOfs[3])[h1], (blockHist + pk.histOfs[2])[h1],
//            (blockHist + pk.histOfs[1])[h1], (blockHist + pk.histOfs[0])[h1]);
//
//        _hist0 = _mm_add_ps(_t0, _hist0);
//        _hist1 = _mm_add_ps(_t1, _hist1);
//
//        _mm_storeu_ps(hist0, _hist0);
//        _mm_storeu_ps(hist1, _hist1);
//
//        (pk.histOfs[0] + blockHist)[h0] = hist0[0];
//        (pk.histOfs[1] + blockHist)[h0] = hist0[1];
//        (pk.histOfs[2] + blockHist)[h0] = hist0[2];
//        (pk.histOfs[3] + blockHist)[h0] = hist0[3];
//
//        (pk.histOfs[0] + blockHist)[h1] = hist1[0];
//        (pk.histOfs[1] + blockHist)[h1] = hist1[1];
//        (pk.histOfs[2] + blockHist)[h1] = hist1[2];
//        (pk.histOfs[3] + blockHist)[h1] = hist1[3];
    }
#elif CV_NEON
    for( ; k < C4; k++ )
    {
        const PixData& pk = _pixData[k];
        const float* const a = gradPtr + pk.gradOfs;
        const uchar* const h = qanglePtr + pk.qangleOfs;
        int h0 = h[0], h1 = h[1];

        float32x4_t _a0 = vdupq_n_f32(a[0]), _a1 = vdupq_n_f32(a[1]);
        float32x4_t _w = vmulq_f32(vdupq_n_f32(pk.gradWeight), vld1q_f32(pk.histWeights));

        float32x4_t _h0 = vsetq_f32((blockHist + pk.histOfs[0])[h0],
                                    (blockHist + pk.histOfs[1])[h0],
                                    (blockHist + pk.histOfs[2])[h0],
                                    (blockHist + pk.histOfs[3])[h0]);
        float32x4_t _h1 = vsetq_f32((blockHist + pk.histOfs[0])[h1],
                                    (blockHist + pk.histOfs[1])[h1],
                                    (blockHist + pk.histOfs[2])[h1],
                                    (blockHist + pk.histOfs[3])[h1]);


        float32x4_t _t0 = vmlaq_f32(_h0, _a0, _w), _t1 = vmlaq_f32(_h1, _a1, _w);
        vst1q_f32(hist0, _t0);
        vst1q_f32(hist1, _t1);

        (blockHist + pk.histOfs[0])[h0] = hist0[0];
        (blockHist + pk.histOfs[1])[h0] = hist0[1];
        (blockHist + pk.histOfs[2])[h0] = hist0[2];
        (blockHist + pk.histOfs[3])[h0] = hist0[3];

        (blockHist + pk.histOfs[0])[h1] = hist1[0];
        (blockHist + pk.histOfs[1])[h1] = hist1[1];
        (blockHist + pk.histOfs[2])[h1] = hist1[2];
        (blockHist + pk.histOfs[3])[h1] = hist1[3];
    }
#else
    for( ; k < C4; k++ )
    {
        const PixData& pk = _pixData[k];
        const float* a = gradPtr + pk.gradOfs;
        float w, t0, t1, a0 = a[0], a1 = a[1];
        const uchar* h = qanglePtr + pk.qangleOfs;
        int h0 = h[0], h1 = h[1];

        float* hist = blockHist + pk.histOfs[0];
        w = pk.gradWeight*pk.histWeights[0];
        t0 = hist[h0] + a0*w;
        t1 = hist[h1] + a1*w;
        hist[h0] = t0; hist[h1] = t1;

        hist = blockHist + pk.histOfs[1];
        w = pk.gradWeight*pk.histWeights[1];
        t0 = hist[h0] + a0*w;
        t1 = hist[h1] + a1*w;
        hist[h0] = t0; hist[h1] = t1;

        hist = blockHist + pk.histOfs[2];
        w = pk.gradWeight*pk.histWeights[2];
        t0 = hist[h0] + a0*w;
        t1 = hist[h1] + a1*w;
        hist[h0] = t0; hist[h1] = t1;

        hist = blockHist + pk.histOfs[3];
        w = pk.gradWeight*pk.histWeights[3];
        t0 = hist[h0] + a0*w;
        t1 = hist[h1] + a1*w;
        hist[h0] = t0; hist[h1] = t1;
    }
#endif

    normalizeBlockHistogram(blockHist);

    return blockHist;

HOGCacheTester::normalizeBlockHistogram

L2-norm

$v\to v=\frac{v}{\sqrt{\|v \|^2_2 +\epsilon^2}}$
partSum记录所有方块对应4个单元格的平方和。
第2个循环的作用是什么？

    float* hist = &_hist[0], sum = 0.0f, partSum[4];
    size_t i = 0, sz = blockHistogramSize;

#if CV_SSE2
    __m128 p0 = _mm_loadu_ps(hist);
    __m128 s = _mm_mul_ps(p0, p0);

    for (i = 4; i <= sz - 4; i += 4)
    {
        p0 = _mm_loadu_ps(hist + i);
        s = _mm_add_ps(s, _mm_mul_ps(p0, p0));
    }
    _mm_storeu_ps(partSum, s);
#elif CV_NEON
    float32x4_t p0 = vld1q_f32(hist);
    float32x4_t s = vmulq_f32(p0, p0);

    for (i = 4; i <= sz - 4; i += 4)
    {
        p0 = vld1q_f32(hist + i);
        s = vaddq_f32(s, vmulq_f32(p0, p0));
    }
    vst1q_f32(partSum, s);
#else
    partSum[0] = 0.0f;
    partSum[1] = 0.0f;
    partSum[2] = 0.0f;
    partSum[3] = 0.0f;
    for ( ; i <= sz - 4; i += 4)
    {
        partSum[0] += hist[i] * hist[i];
        partSum[1] += hist[i+1] * hist[i+1];
        partSum[2] += hist[i+2] * hist[i+2];
        partSum[3] += hist[i+3] * hist[i+3];
    }
#endif
    float t0 = partSum[0] + partSum[1];
    float t1 = partSum[2] + partSum[3];
    sum = t0 + t1;
    for ( ; i < sz; ++i)
        sum += hist[i]*hist[i];

    float scale = 1.f/(std::sqrt(sum)+sz*0.1f), thresh = (float)descriptor->L2HysThreshold;

对归一化直方图并进行截断，重新计算平方和，再行归一化。

    i = 0, sum = 0.0f;

#if CV_SSE2
    __m128 _scale = _mm_set1_ps(scale);
    static __m128 _threshold = _mm_set1_ps(thresh);

    __m128 p = _mm_mul_ps(_scale, _mm_loadu_ps(hist));
    p = _mm_min_ps(p, _threshold);
    s = _mm_mul_ps(p, p);
    _mm_storeu_ps(hist, p);

    for(i = 4 ; i <= sz - 4; i += 4)
    {
        p = _mm_loadu_ps(hist + i);
        p = _mm_mul_ps(p, _scale);
        p = _mm_min_ps(p, _threshold);
        s = _mm_add_ps(s, _mm_mul_ps(p, p));
        _mm_storeu_ps(hist + i, p);
    }

    _mm_storeu_ps(partSum, s);
#elif CV_NEON
    float32x4_t _scale = vdupq_n_f32(scale);
    static float32x4_t _threshold = vdupq_n_f32(thresh);

    float32x4_t p = vmulq_f32(_scale, vld1q_f32(hist));
    p = vminq_f32(p, _threshold);
    s = vmulq_f32(p, p);
    vst1q_f32(hist, p);

    for(i = 4 ; i <= sz - 4; i += 4)
    {
        p = vld1q_f32(hist + i);
        p = vmulq_f32(p, _scale);
        p = vminq_f32(p, _threshold);
        s = vaddq_f32(s, vmulq_f32(p, p));
        vst1q_f32(hist + i, p);
    }

    vst1q_f32(partSum, s);
#else
    partSum[0] = 0.0f;
    partSum[1] = 0.0f;
    partSum[2] = 0.0f;
    partSum[3] = 0.0f;
    for( ; i <= sz - 4; i += 4)
    {
        hist[i] = std::min(hist[i]*scale, thresh);
        hist[i+1] = std::min(hist[i+1]*scale, thresh);
        hist[i+2] = std::min(hist[i+2]*scale, thresh);
        hist[i+3] = std::min(hist[i+3]*scale, thresh);
        partSum[0] += hist[i]*hist[i];
        partSum[1] += hist[i+1]*hist[i+1];
        partSum[2] += hist[i+2]*hist[i+2];
        partSum[3] += hist[i+3]*hist[i+3];
    }
#endif
    t0 = partSum[0] + partSum[1];
    t1 = partSum[2] + partSum[3];
    sum = t0 + t1;
    for( ; i < sz; ++i)
    {
        hist[i] = std::min(hist[i]*scale, thresh);
        sum += hist[i]*hist[i];
    }

    scale = 1.f/(std::sqrt(sum)+1e-3f), i = 0;
#if CV_SSE2
    __m128 _scale2 = _mm_set1_ps(scale);
    for ( ; i <= sz - 4; i += 4)
    {
        __m128 t = _mm_mul_ps(_scale2, _mm_loadu_ps(hist + i));
        _mm_storeu_ps(hist + i, t);
    }
#elif CV_NEON
    float32x4_t _scale2 = vdupq_n_f32(scale);
    for ( ; i <= sz - 4; i += 4)
    {
        float32x4_t t = vmulq_f32(_scale2, vld1q_f32(hist + i));
        vst1q_f32(hist + i, t);
    }
#endif
    for ( ; i < sz; ++i)
        hist[i] *= scale;