有这样一种应用场景:在一个海量的数据集合中查看某元素是否存在,同时希望查询速度尽可能快、存储空间尽可能少,比如URL排重、UV统计等等。早先数据量不大的时候首选B树,把URL、UID哈希一下做key,计数器做value,优点是准确性高、附带信息全、存储空间随用随申请,插入和查询效率马马虎虎,缺点是成本高,数据量小的时候性价比太差;后来用布隆过滤器,依旧把URL、UID哈希以后映射到布隆串的比特位上,优点是准确性比较高、赋值与检索效率高,缺点是无键值对信息,存储空间大且需提前申请并初始化,128位机以下存在小概率冲突;现在用redis的HyperLogLog了,优点是太省空间,数据可分区、可持久化,赋值和查询效率高,缺点是需要容忍一定的误差,只能估算基数而不能判断某个元素是否存在,也不能存储键值对信息。
给段布隆代码:
/********************************************************/
/* Author: gong_libin */
/* Date: 2017_08_01 */
/* File: CmBloom.h */
/********************************************************/
#ifndef _CMBLOOM_H
#define _CMBLOOM_H
#include "CmGlobal.h"
#include <sys/ipc.h>
#include <sys/shm.h>
namespace CmCls
{
/* Single-bit masks for one filter byte, numbered from the most significant
 * bit (CM_BLOOM_0) down to the least significant bit (CM_BLOOM_7). */
#define CM_BLOOM_0 (1 << 7)
#define CM_BLOOM_1 (1 << 6)
#define CM_BLOOM_2 (1 << 5)
#define CM_BLOOM_3 (1 << 4)
#define CM_BLOOM_4 (1 << 3)
#define CM_BLOOM_5 (1 << 2)
#define CM_BLOOM_6 (1 << 1)
#define CM_BLOOM_7 (1 << 0)
/* Bit-array membership filter whose storage is a System V shared memory
 * segment (obtained with shmget and attached with shmat in CmBloomCreate). */
class CCmBloom
{
public:
/* Starts with no segment: size 0, NULL array pointer, id CM_FAILURE. */
CCmBloom();
/* Empty in the implementation: performs no cleanup of the segment. */
virtual ~CCmBloom();
/* Zero-fills m_ulSize bytes of the attached array. */
void CmBloomReset();
/* Marks the segment for removal (shmctl IPC_RMID) and clears the members. */
void CmBloomDelete();
/* Returns the array size in bytes as passed to CmBloomCreate (0 if none). */
ULONG CmBloomGetSize();
/* Returns the address of the attached array (NULL if none). */
UCHAR* CmBloomGetBloom();
/* Gets or creates the segment for iKey with ulSize bytes and attaches it.
 * Returns CM_SUCCESS or CM_FAILURE. */
int CmBloomCreate(key_t iKey, ULONG ulSize);
/* Returns true when the bit selected by ullKey is set. */
bool CmBloomCheck(ULONGLONG ullKey);
/* Sets the bit selected by ullKey. */
void CmBloomUpdate(ULONGLONG ullKey);
protected:
/* Size of the array in bytes. */
ULONG m_ulSize;
/* Address returned by shmat, or NULL. */
UCHAR* m_puszBloom;
private:
/* Segment identifier returned by shmget, or CM_FAILURE. */
int m_iBloom;
};
/**
 * Test whether the bit selected by ullKey is set.
 *
 * The key is reduced to a bit index in [0, m_ulSize * 8); the byte is
 * index / 8 and the bit inside it is index % 8, counted from the most
 * significant bit. CmBloomUpdate() uses the same mapping.
 *
 * The earlier mapping (byte = key % size, bit = key % 8) tied the bit to the
 * byte: whenever the size is a multiple of 8 each byte could only ever use
 * one of its eight bits. Bits written with that mapping are NOT found by this
 * one, so reset existing segments after switching.
 *
 * @param ullKey  hashed key.
 * @return true if the bit is set; false if it is clear or no array is
 *         attached (NULL pointer or zero size).
 */
inline bool CCmBloom::CmBloomCheck(ULONGLONG ullKey)
{
    /* Nothing attached: avoid the modulo by zero and the NULL dereference. */
    if (NULL == m_puszBloom || 0 == m_ulSize) {
        return false;
    }

    static const UCHAR uszMask[] = {CM_BLOOM_0, CM_BLOOM_1, CM_BLOOM_2, CM_BLOOM_3,
                                    CM_BLOOM_4, CM_BLOOM_5, CM_BLOOM_6, CM_BLOOM_7};
    const ULONGLONG ullBit = ullKey % (static_cast<ULONGLONG>(m_ulSize) * 8);

    return 0 != (uszMask[ullBit % 8] & m_puszBloom[ullBit / 8]);
}

/**
 * Set the bit selected by ullKey, using the same key-to-bit mapping as
 * CmBloomCheck(). Does nothing when no array is attached.
 *
 * @param ullKey  hashed key.
 */
inline void CCmBloom::CmBloomUpdate(ULONGLONG ullKey)
{
    /* Nothing attached: avoid the modulo by zero and the NULL dereference. */
    if (NULL == m_puszBloom || 0 == m_ulSize) {
        return;
    }

    static const UCHAR uszMask[] = {CM_BLOOM_0, CM_BLOOM_1, CM_BLOOM_2, CM_BLOOM_3,
                                    CM_BLOOM_4, CM_BLOOM_5, CM_BLOOM_6, CM_BLOOM_7};
    const ULONGLONG ullBit = ullKey % (static_cast<ULONGLONG>(m_ulSize) * 8);

    m_puszBloom[ullBit / 8] |= uszMask[ullBit % 8];
}
} /* CmCls */
#endif /* _CMBLOOM_H */
/********************************************************/
/* Author: gong_libin */
/* Date: 2017_08_01 */
/* File: CmBloom.cpp */
/********************************************************/
#include "CmBloom.h"
namespace CmCls
{
/* Construct a filter with nothing attached: zero size, NULL array pointer
 * and CM_FAILURE as the segment identifier. */
CCmBloom::CCmBloom()
    : m_ulSize(0),
      m_puszBloom(NULL),
      m_iBloom(CM_FAILURE)
{
}
/* Destructor. Does nothing: the attached mapping and the shared memory
 * segment are left untouched. Call CmBloomDelete() to mark the segment for
 * removal. */
CCmBloom::~CCmBloom()
{
}
/* Return the size of the bit array in bytes (0 when nothing is attached). */
ULONG CCmBloom::CmBloomGetSize()
{
    return this->m_ulSize;
}
/* Return the address of the bit array (NULL when nothing is attached). */
UCHAR* CCmBloom::CmBloomGetBloom()
{
    return this->m_puszBloom;
}
int CCmBloom::CmBloomCreate(key_t iKey, ULONG ulSize)
{
int iReturn = CM_SUCCESS;
if (CM_FAILURE != (m_iBloom = shmget(iKey, ulSize, IPC_CREAT | 0777))) {
m_ulSize = ulSize;
if ((void*)CM_FAILURE == (m_puszBloom = (UCHAR*)shmat(m_iBloom, NULL, 0))) {
CM_ERROR("%s\n", strerror(errno));
iReturn = CM_FAILURE;
}
}
else {
CM_ERROR("%s\n", strerror(errno));
iReturn = CM_FAILURE;
}
return iReturn;
}
/**
 * Clear every bit of the filter by zero-filling m_ulSize bytes of the array.
 * Does nothing when no array is attached, because passing a NULL pointer to
 * memset is undefined behaviour.
 */
void CCmBloom::CmBloomReset()
{
    if (NULL == m_puszBloom || 0 == m_ulSize) {
        return;
    }
    memset(m_puszBloom, '\0', m_ulSize);
}
void CCmBloom::CmBloomDelete()
{
shmctl(m_iBloom, IPC_RMID, 0);
m_iBloom = CM_FAILURE;
m_puszBloom = NULL;
m_ulSize = 0;
return;
}
} /* CmCls */
HyperLogLog是redis在2.8.9版本新增的数据结构,每个键区区12KB就能估算接近2^64个元素的基数;它基于概率统计理论的算法,用误差换空间,让人还能有什么奢望呢?
| 序号 | 命令及描述 |
|---|---|
| 1 | PFADD key element [element ...] 添加指定元素到 HyperLogLog 中。 |
| 2 | PFCOUNT key [key ...] 返回给定 HyperLogLog 的基数估算值。 |
| 3 | PFMERGE destkey sourcekey [sourcekey ...] 将多个 HyperLogLog 合并为一个 HyperLogLog。 |
redis 127.0.0.1:6379> PFADD cookies "aaa"
(integer) 1
redis 127.0.0.1:6379> PFADD cookies "bbb"
(integer) 1
redis 127.0.0.1:6379> PFADD cookies "ccc"
(integer) 1
redis 127.0.0.1:6379> PFCOUNT cookies
(integer) 3
狗年到了,去年共享经济相当火,今年轮到区块链,谁知道明年是啥呢?人工智能对各个行业的冲击让许多人猝不及防,油腻的中年危机压得人喘不过气。英国BBC从1964年开始每隔七年记录来自不同社会阶层的十几个孩子的生活,纪录片从黑白胶片到彩色高清,岁月改变了他们的容貌,却没有改变他们的社会阶层,阶级固化正在我们身边悄然发生~~~