hyperloglog

最新推荐文章于 2025-03-19 14:30:54 发布

谛听-

最新推荐文章于 2025-03-19 14:30:54 发布

阅读量508

点赞数

分类专栏： redis

本文链接：https://blog.csdn.net/u012319493/article/details/82708633

版权

redis 专栏收录该内容

3 篇文章

订阅专栏

目的
估计一个多重集中有多少个不同的数字。

论文中的算法
http://www.lix.polytechnique.fr/Labo/Eric.Fusy/Articles/FlFuGaMe07.pdf
这里写图片描述

数学解释
https://blog.csdn.net/firenet1/article/details/77247649

步骤

将多重集中的每一个数字 $v$ 进行哈希，得到 $x$ ；
桶的下标 $j$ = $x$ 的高 $b$ 位表示的值+1；
$w$ = $x$ 的剩余低位代表的值；
$p(w)$ 为从左边起第1个1出现的位置；
更新第 $j$ 个桶中的值 $M[j]=\max(M[j],\,p(w))$ ；

通过所有桶中值的调和平均数，得到不同数字的估计值 $E=\alpha_m m^2 \big/ \sum_{j=1}^{m} 2^{-M[j]}$ （其中 $m=2^b$ 为桶的个数）；
根据 $E$ 的大小对其修正。

redis 源码

/* Estimate the number of distinct elements recorded by a HyperLogLog,
 * using the harmonic mean of its register values.
 *
 * 'hdr' is the beginning of the SDS string backing the String object that
 * stores the HLL. For the sparse encoding the payload length handed to
 * hllSparseSum() is the SDS length minus HLL_HDR_SIZE.
 *
 * 'invalid' is only forwarded to hllSparseSum(): when the sparse
 * representation turns out to be malformed the pointed integer is set to
 * non-zero, otherwise it is not modified.
 *
 * Besides HLL_DENSE and HLL_SPARSE the function accepts HLL_RAW, an
 * internal-only encoding where hdr->registers refers to a plain uint8_t
 * array of HLL_REGISTERS entries. It lets PFCOUNT over several keys skip
 * the 6-bit packed register format. Any other encoding ends up in
 * redisPanic().
 *
 * Returns the estimate truncated to uint64_t. */
uint64_t hllCount(struct hllhdr *hdr, int *invalid) {
    /* inv_pow2[k] caches 2^-k so that the per-register term of
     * SUM(2^-register[i]) is a table lookup. Filled on the first call. */
    static double inv_pow2[64];
    static int table_ready = 0;

    const double m = HLL_REGISTERS;
    const double alpha = 0.7213/(1+1.079/m);
    double estimate;
    int zeros; /* How many registers are equal to 0. */

    if (!table_ready) {
        int k = 0;
        double term = 1; /* 2^-0 */

        /* Dividing by two is exact in binary floating point, so every
         * entry is precisely 1/2^k. */
        while (k < 64) {
            inv_pow2[k++] = term;
            term /= 2;
        }
        table_ready = 1;
    }

    /* Step 1: SUM(2^-register[i]) plus the count of empty registers,
     * computed by the helper matching the encoding. */
    if (hdr->encoding == HLL_DENSE) {
        estimate = hllDenseSum(hdr->registers,inv_pow2,&zeros);
    } else if (hdr->encoding == HLL_SPARSE) {
        estimate = hllSparseSum(hdr->registers,
                                sdslen((sds)hdr)-HLL_HDR_SIZE,
                                inv_pow2,&zeros,invalid);
    } else if (hdr->encoding == HLL_RAW) {
        estimate = hllRawSum(hdr->registers,inv_pow2,&zeros);
    } else {
        redisPanic("Unknown HyperLogLog encoding in hllCount()");
    }

    /* Step 2: raw HyperLogLog estimate, alpha_m * m^2 / SUM. */
    estimate = (1/estimate)*alpha*m*m;

    /* Step 3: range dependent corrections.
     *
     * - Small cardinalities (raw estimate below 2.5*m with at least one
     *   empty register): switch to linear counting.
     * - Otherwise, below 72000 and only for m == 16384 (P=14, the setup
     *   the fit was verified with): subtract a bias, expressed as a
     *   percentage of the estimate, given by a 4th degree polynomial
     *   obtained via regression.
     *
     * No large range correction is applied: the hash is 64 bit wide with
     * 6 bit counters, so a set big enough to need it is not a practical
     * concern. */
    if (zeros != 0 && estimate < m*2.5) {
        estimate = m*log(m/zeros); /* LINEARCOUNTING() */
    } else if (m == 16384 && estimate < 72000) {
        double bias = 5.9119*1.0e-18*(estimate*estimate*estimate*estimate)
                      -1.4253*1.0e-12*(estimate*estimate*estimate)+
                      1.2940*1.0e-7*(estimate*estimate)
                      -5.2921*1.0e-3*estimate+
                      83.3216;
        estimate -= estimate*(bias/100);
    }

    return (uint64_t) estimate;
}