2021-09-06

布隆过滤器

说明

记录自己学习写下的代码。

#ifndef __MICRO_BLOOMFILTER_H__
#define __MICRO_BLOOMFILTER_H__

/**
 *
 *  仿照Cassandra中的BloomFilter实现,Hash选用MurmurHash2,通过双重散列公式生成散列函数,参考:http://hur.st/bloomfilter
 *    Hash(key, i) = (H1(key) + i * H2(key)) % m
 *
**/

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <math.h>

#define __BLOOMFILTER_VERSION__ "1.1"
#define __MGAIC_CODE__          (0x01464C42)

/**
 *  BloomFilter使用例子:
 *  static BaseBloomFilter stBloomFilter = {0};
 *
 *  初始化BloomFilter(最大100000元素,不超过0.00001的错误率):
 *      InitBloomFilter(&stBloomFilter, 0, 100000, 0.00001);
 *  重置BloomFilter:
 *      ResetBloomFilter(&stBloomFilter);
 *  释放BloomFilter:
 *      FreeBloomFilter(&stBloomFilter);
 *
 *  向BloomFilter中新增一个数值(0-正常,1-加入数值过多):
 *      uint32_t dwValue;
 *      iRet = BloomFilter_Add(&stBloomFilter, &dwValue, sizeof(uint32_t));
 *  检查数值是否在BloomFilter内(0-存在,1-不存在):
 *      iRet = BloomFilter_Check(&stBloomFilter, &dwValue, sizeof(uint32_t));
 *
 *  (1.1新增) 将生成好的BloomFilter写入文件:
 *      iRet = SaveBloomFilterToFile(&stBloomFilter, "dump.bin")
 *  (1.1新增) 从文件读取生成好的BloomFilter:
 *      iRet = LoadBloomFilterFromFile(&stBloomFilter, "dump.bin")
**/

// 注意,要让Add/Check函数内联,必须使用 -O2 或以上的优化等级
#define FORCE_INLINE __attribute__((always_inline))

#define BYTE_BITS           (8)
#define MIX_UINT64(v)       ((uint32_t)((v>>32)^(v)))

#define SETBIT(filter, n)   (filter->pstFilter[n/BYTE_BITS] |= (1 << (n%BYTE_BITS)))
#define GETBIT(filter, n)   (filter->pstFilter[n/BYTE_BITS] & (1 << (n%BYTE_BITS)))

#pragma pack(1)

// BloomFilter结构定义
typedef struct
{
    uint8_t cInitFlag;                              // 初始化标志,为0时的第一次Add()会对stFilter[]做初始化
    uint8_t cResv[3];

    uint32_t dwMaxItems;                            // n - BloomFilter中最大元素个数 (输入量)
    double dProbFalse;                              // p - 假阳概率 (输入量,比如万分之一:0.00001)
    uint32_t dwFilterBits;                          // m = ceil((n * log(p)) / log(1.0 / (pow(2.0, log(2.0))))); - BloomFilter的比特数
    uint32_t dwHashFuncs;                           // k = round(log(2.0) * m / n); - 哈希函数个数
    
    uint32_t dwSeed;                                // MurmurHash的种子偏移量
    uint32_t dwCount;                               // Add()的计数,超过MAX_BLOOMFILTER_N则返回失败

    uint32_t dwFilterSize;                          // dwFilterBits / BYTE_BITS
    unsigned char *pstFilter;                       // BloomFilter存储指针,使用malloc分配
    uint32_t *pdwHashPos;                           // 存储上次hash得到的K个bit位置数组(由bloom_hash填充)
} BaseBloomFilter;

// BloomFilter文件头部定义
typedef struct
{
    uint32_t dwMagicCode;                           // 文件头部标识,填充 __MGAIC_CODE__
    uint32_t dwSeed;
    uint32_t dwCount;

    uint32_t dwMaxItems;                            // n - BloomFilter中最大元素个数 (输入量)
    double dProbFalse;                              // p - 假阳概率 (输入量,比如万分之一:0.00001)
    uint32_t dwFilterBits;                          // m = ceil((n * log(p)) / log(1.0 / (pow(2.0, log(2.0))))); - BloomFilter的比特数
    uint32_t dwHashFuncs;                           // k = round(log(2.0) * m / n); - 哈希函数个数

    uint32_t dwResv[6];
    uint32_t dwFileCrc;                             // (未使用)整个文件的校验和
    uint32_t dwFilterSize;                          // 后面Filter的Buffer长度
} BloomFileHead;

#pragma pack()


// 计算BloomFilter的参数m,k
static inline void _CalcBloomFilterParam(uint32_t n, double p, uint32_t *pm, uint32_t *pk)
{
    /**
     *  n - Number of items in the filter
     *  p - Probability of false positives, float between 0 and 1 or a number indicating 1-in-p
     *  m - Number of bits in the filter
     *  k - Number of hash functions
     *
     *  f = ln(2) × ln(1/2) × m / n = (0.6185) ^ (m/n)
     *  m = -1 * ln(p) × n / 0.6185
     *  k = ln(2) × m / n = 0.6931 * m / n
    **/

    uint32_t m, k;

    // 计算指定假阳概率下需要的比特数
    m = (uint32_t) ceil(-1 * log(p) * n / 0.6185);
    m = (m - m % 64) + 64;                  // 8字节对齐

    // 计算哈希函数个数
    k = (uint32_t) (0.6931 * m / n);
    k++;

    *pm = m;
    *pk = k;
    return;
}


// 根据目标精度和数据个数,初始化BloomFilter结构
inline int InitBloomFilter(BaseBloomFilter *pstBloomfilter, uint32_t dwSeed, uint32_t dwMaxItems, double dProbFalse)
{
    if (pstBloomfilter == NULL)
        return -1;
    if ((dProbFalse <= 0) || (dProbFalse >= 1))
        return -2;

    // 先检查是否重复Init,释放内存
    if (pstBloomfilter->pstFilter != NULL)
        free(pstBloomfilter->pstFilter);
    if (pstBloomfilter->pdwHashPos != NULL)
        free(pstBloomfilter->pdwHashPos);

    memset(pstBloomfilter, 0, sizeof(BaseBloomFilter));

    // 初始化内存结构,并计算BloomFilter需要的空间
    pstBloomfilter->dwMaxItems = dwMaxItems;
    pstBloomfilter->dProbFalse = dProbFalse;
    pstBloomfilter->dwSeed = dwSeed;

    // 计算 m, k
    _CalcBloomFilterParam(pstBloomfilter->dwMaxItems, pstBloomfilter->dProbFalse,
                    &pstBloomfilter->dwFilterBits, &pstBloomfilter->dwHashFuncs);

    // 分配BloomFilter的存储空间
    pstBloomfilter->dwFilterSize = pstBloomfilter->dwFilterBits / BYTE_BITS;
    pstBloomfilter->pstFilter = (unsigned char *) malloc(pstBloomfilter->dwFilterSize);
    if (NULL == pstBloomfilter->pstFilter)
        return -100;

    // 哈希结果数组,每个哈希函数一个
    pstBloomfilter->pdwHashPos = (uint32_t*) malloc(pstBloomfilter->dwHashFuncs * sizeof(uint32_t));
    if (NULL == pstBloomfilter->pdwHashPos)
        return -200;

    printf(">>> Init BloomFilter(n=%u, p=%f, m=%u, k=%d), malloc() size=%.2fMB\n",
            pstBloomfilter->dwMaxItems, pstBloomfilter->dProbFalse, pstBloomfilter->dwFilterBits,
            pstBloomfilter->dwHashFuncs, (double)pstBloomfilter->dwFilterSize/1024/1024);
    
    // 初始化BloomFilter的内存
    memset(pstBloomfilter->pstFilter, 0, pstBloomfilter->dwFilterSize);
    pstBloomfilter->cInitFlag = 1;
    return 0;
}

// 释放BloomFilter
inline int FreeBloomFilter(BaseBloomFilter *pstBloomfilter)
{
    if (pstBloomfilter == NULL)
        return -1;
    
    pstBloomfilter->cInitFlag = 0;
    pstBloomfilter->dwCount = 0;

    free(pstBloomfilter->pstFilter);
    pstBloomfilter->pstFilter = NULL;
    free(pstBloomfilter->pdwHashPos);
    pstBloomfilter->pdwHashPos = NULL;
    return 0;
}

// 重置BloomFilter
// 注意: Reset()函数不会立即初始化stFilter,而是当一次Add()时去memset
inline int ResetBloomFilter(BaseBloomFilter *pstBloomfilter)
{
    if (pstBloomfilter == NULL)
        return -1;
    
    pstBloomfilter->cInitFlag = 0;
    pstBloomfilter->dwCount = 0;
    return 0;
}

// 和ResetBloomFilter不同,调用后立即memset内存
inline int RealResetBloomFilter(BaseBloomFilter *pstBloomfilter)
{
    if (pstBloomfilter == NULL)
        return -1;
    
    memset(pstBloomfilter->pstFilter, 0, pstBloomfilter->dwFilterSize);
    pstBloomfilter->cInitFlag = 1;
    pstBloomfilter->dwCount = 0;
    return 0;
}

///
///  函数FORCE_INLINE,加速执行
///
// MurmurHash2, 64-bit versions, by Austin Appleby
// https://sites.google.com/site/murmurhash/
FORCE_INLINE uint64_t MurmurHash2_x64 ( const void * key, int len, uint32_t seed )
{
	const uint64_t m = 0xc6a4a7935bd1e995;
	const int r = 47;

	uint64_t h = seed ^ (len * m);

	const uint64_t * data = (const uint64_t *)key;
	const uint64_t * end = data + (len/8);

	while(data != end)
	{
		uint64_t k = *data++;

		k *= m; 
		k ^= k >> r; 
		k *= m; 
		
		h ^= k;
		h *= m;
	}

	const uint8_t * data2 = (const uint8_t*)data;

	switch(len & 7)
	{
	case 7: h ^= ((uint64_t)data2[6]) << 48;
	case 6: h ^= ((uint64_t)data2[5]) << 40;
	case 5: h ^= ((uint64_t)data2[4]) << 32;
	case 4: h ^= ((uint64_t)data2[3]) << 24;
	case 3: h ^= ((uint64_t)data2[2]) << 16;
	case 2: h ^= ((uint64_t)data2[1]) << 8;
	case 1: h ^= ((uint64_t)data2[0]);
	        h *= m;
	};
 
	h ^= h >> r;
	h *= m;
	h ^= h >> r;

	return h;
}

// 双重散列封装
FORCE_INLINE void bloom_hash(BaseBloomFilter *pstBloomfilter, const void * key, int len)
{
    //if (pstBloomfilter == NULL) return;
    int i;
    uint32_t dwFilterBits = pstBloomfilter->dwFilterBits;
    uint64_t hash1 = MurmurHash2_x64(key, len, pstBloomfilter->dwSeed);
    uint64_t hash2 = MurmurHash2_x64(key, len, MIX_UINT64(hash1));
    
    for (i = 0; i < (int)pstBloomfilter->dwHashFuncs; i++)
    {
        pstBloomfilter->pdwHashPos[i] = (hash1 + i*hash2) % dwFilterBits;
    }

    return;
}

// 向BloomFilter中新增一个元素
// 成功返回0,当添加数据超过限制值时返回1提示用户
FORCE_INLINE int BloomFilter_Add(BaseBloomFilter *pstBloomfilter, const void * key, int len)
{
    if ((pstBloomfilter == NULL) || (key == NULL) || (len <= 0))
        return -1;
    
    int i;
    
    if (pstBloomfilter->cInitFlag != 1)
    {
        // Reset后没有初始化,使用前需要memset
        memset(pstBloomfilter->pstFilter, 0, pstBloomfilter->dwFilterSize);
        pstBloomfilter->cInitFlag = 1;
    }
    
    // hash key到bloomfilter中
    bloom_hash(pstBloomfilter, key, len);
    for (i = 0; i < (int)pstBloomfilter->dwHashFuncs; i++)
    {
        SETBIT(pstBloomfilter, pstBloomfilter->pdwHashPos[i]);
    }
    
    // 增加count数
    pstBloomfilter->dwCount++;
    if (pstBloomfilter->dwCount <= pstBloomfilter->dwMaxItems)
        return 0;
    else
        return 1;       // 超过N最大值,可能出现准确率下降等情况
}

// 检查一个元素是否在bloomfilter中
// 返回:0-存在,1-不存在,负数表示失败
FORCE_INLINE int BloomFilter_Check(BaseBloomFilter *pstBloomfilter, const void * key, int len)
{
    if ((pstBloomfilter == NULL) || (key == NULL) || (len <= 0))
        return -1;
    
    int i;
    
    bloom_hash(pstBloomfilter, key, len);
    for (i = 0; i < (int)pstBloomfilter->dwHashFuncs; i++)
    {
        // 如果有任意bit不为1,说明key不在bloomfilter中
        // 注意: GETBIT()返回不是0|1,高位可能出现128之类的情况
        if (GETBIT(pstBloomfilter, pstBloomfilter->pdwHashPos[i]) == 0)
            return 1;
    }
    
    return 0;
}


/* 文件相关封装 */
// 将生成好的BloomFilter写入文件
inline int SaveBloomFilterToFile(BaseBloomFilter *pstBloomfilter, char *szFileName)
{
    if ((pstBloomfilter == NULL) || (szFileName == NULL))
        return -1;

    int iRet;
    FILE *pFile;
    static BloomFileHead stFileHeader = {0};

    pFile = fopen(szFileName, "wb");
    if (pFile == NULL)
    {
        perror("fopen");
        return -11;
    }

    // 先写入文件头
    stFileHeader.dwMagicCode = __MGAIC_CODE__;
    stFileHeader.dwSeed = pstBloomfilter->dwSeed;
    stFileHeader.dwCount = pstBloomfilter->dwCount;
    stFileHeader.dwMaxItems = pstBloomfilter->dwMaxItems;
    stFileHeader.dProbFalse = pstBloomfilter->dProbFalse;
    stFileHeader.dwFilterBits = pstBloomfilter->dwFilterBits;
    stFileHeader.dwHashFuncs = pstBloomfilter->dwHashFuncs;
    stFileHeader.dwFilterSize = pstBloomfilter->dwFilterSize;

    iRet = fwrite((const void*)&stFileHeader, sizeof(stFileHeader), 1, pFile);
    if (iRet != 1)
    {
        perror("fwrite(head)");
        return -21;
    }

    // 接着写入BloomFilter的内容
    iRet = fwrite(pstBloomfilter->pstFilter, 1, pstBloomfilter->dwFilterSize, pFile);
    if ((uint32_t)iRet != pstBloomfilter->dwFilterSize)
    {
        perror("fwrite(data)");
        return -31;
    }

    fclose(pFile);
    return 0;
}

// 从文件读取生成好的BloomFilter
inline int LoadBloomFilterFromFile(BaseBloomFilter *pstBloomfilter, char *szFileName)
{
    if ((pstBloomfilter == NULL) || (szFileName == NULL))
        return -1;

    int iRet;
    FILE *pFile;
    static BloomFileHead stFileHeader = {0};

    if (pstBloomfilter->pstFilter != NULL)
        free(pstBloomfilter->pstFilter);
    if (pstBloomfilter->pdwHashPos != NULL)
        free(pstBloomfilter->pdwHashPos);

    //
    pFile = fopen(szFileName, "rb");
    if (pFile == NULL)
    {
        perror("fopen");
        return -11;
    }

    // 读取并检查文件头
    iRet = fread((void*)&stFileHeader, sizeof(stFileHeader), 1, pFile);
    if (iRet != 1)
    {
        perror("fread(head)");
        return -21;
    }

    if ((stFileHeader.dwMagicCode != __MGAIC_CODE__)
        || (stFileHeader.dwFilterBits != stFileHeader.dwFilterSize*BYTE_BITS))
        return -50;

    // 初始化传入的 BaseBloomFilter 结构
    pstBloomfilter->dwMaxItems = stFileHeader.dwMaxItems;
    pstBloomfilter->dProbFalse = stFileHeader.dProbFalse;
    pstBloomfilter->dwFilterBits = stFileHeader.dwFilterBits;
    pstBloomfilter->dwHashFuncs = stFileHeader.dwHashFuncs;
    pstBloomfilter->dwSeed = stFileHeader.dwSeed;
    pstBloomfilter->dwCount = stFileHeader.dwCount;
    pstBloomfilter->dwFilterSize = stFileHeader.dwFilterSize;

    pstBloomfilter->pstFilter = (unsigned char *) malloc(pstBloomfilter->dwFilterSize);
    if (NULL == pstBloomfilter->pstFilter)
        return -100;
    pstBloomfilter->pdwHashPos = (uint32_t*) malloc(pstBloomfilter->dwHashFuncs * sizeof(uint32_t));
    if (NULL == pstBloomfilter->pdwHashPos)
        return -200;


    // 将后面的Data部分读入 pstFilter
    iRet = fread((void*)(pstBloomfilter->pstFilter), 1, pstBloomfilter->dwFilterSize, pFile);
    if ((uint32_t)iRet != pstBloomfilter->dwFilterSize)
    {
        perror("fread(data)");
        return -31;
    }
    pstBloomfilter->cInitFlag = 1;

    printf(">>> Load BloomFilter(n=%u, p=%f, m=%u, k=%d), malloc() size=%.2fMB\n",
        pstBloomfilter->dwMaxItems, pstBloomfilter->dProbFalse, pstBloomfilter->dwFilterBits,
        pstBloomfilter->dwHashFuncs, (double)pstBloomfilter->dwFilterSize/1024/1024);

    fclose(pFile);
    return 0;
}

#endif

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
使用python中的pymsql完成如下:表结构与数据创建 1. 建立 `users` 表和 `orders` 表。 `users` 表有用户ID、用户名、年龄字段,(id,name,age) `orders` 表有订单ID、订单日期、订单金额,用户id字段。(id,order_date,amount,user_id) 2 两表的id作为主键,`orders` 表用户id为users的外键 3 插入数据 `users` (1, '张三', 18), (2, '李四', 20), (3, '王五', 22), (4, '赵六', 25), (5, '钱七', 28); `orders` (1, '2021-09-01', 500, 1), (2, '2021-09-02', 1000, 2), (3, '2021-09-03', 600, 3), (4, '2021-09-04', 800, 4), (5, '2021-09-05', 1500, 5), (6, '2021-09-06', 1200, 3), (7, '2021-09-07', 2000, 1), (8, '2021-09-08', 300, 2), (9, '2021-09-09', 700, 5), (10, '2021-09-10', 900, 4); 查询语句 1. 查询订单总金额 2. 查询所有用户的平均年龄,并将结果四舍五入保留两位小数。 3. 查询订单总数最多的用户的姓名和订单总数。 4. 查询所有不重复的年龄。 5. 查询订单日期在2021年9月1日至9月4日之间的订单总金额。 6. 查询年龄不大于25岁的用户的订单数量,并按照降序排序。 7. 查询订单总金额排名前3的用户的姓名和订单总金额。 8. 查询订单总金额最大的用户的姓名和订单总金额。 9. 查询订单总金额最小的用户的姓名和订单总金额。 10. 查询所有名字中含有“李”的用户,按照名字升序排序。 11. 查询所有年龄大于20岁的用户,按照年龄降序排序,并只显示前5条记录。 12. 查询每个用户的订单数量和订单总金额,并按照总金额降序排序。
06-03
这里是使用 Python 的 PyMySQL 完成上述任务的代码: ```python import pymysql # 连接数据库 conn = pymysql.connect(host='localhost', port=3306, user='root', password='123456', database='test') # 创建游标 cur = conn.cursor() # 创建 users 表 cur.execute(''' CREATE TABLE users ( id INT PRIMARY KEY, name VARCHAR(20), age INT ) ''') # 创建 orders 表 cur.execute(''' CREATE TABLE orders ( id INT PRIMARY KEY, order_date DATE, amount INT, user_id INT, FOREIGN KEY (user_id) REFERENCES users(id) ) ''') # 插入数据 users_data = [ (1, '张三', 18), (2, '李四', 20), (3, '王五', 22), (4, '赵六', 25), (5, '钱七', 28) ] orders_data = [ (1, '2021-09-01', 500, 1), (2, '2021-09-02', 1000, 2), (3, '2021-09-03', 600, 3), (4, '2021-09-04', 800, 4), (5, '2021-09-05', 1500, 5), (6, '2021-09-06', 1200, 3), (7, '2021-09-07', 2000, 1), (8, '2021-09-08', 300, 2), (9, '2021-09-09', 700, 5), (10, '2021-09-10', 900, 4) ] for user in users_data: cur.execute('INSERT INTO users VALUES (%s, %s, %s)', user) for order in orders_data: cur.execute('INSERT INTO orders VALUES (%s, %s, %s, %s)', order) # 提交事务 conn.commit() # 查询语句 # 1. 查询订单总金额 cur.execute('SELECT SUM(amount) FROM orders') print(cur.fetchone()[0]) # 2. 查询所有用户的平均年龄,并将结果四舍五入保留两位小数。 cur.execute('SELECT ROUND(AVG(age), 2) FROM users') print(cur.fetchone()[0]) # 3. 查询订单总数最多的用户的姓名和订单总数。 cur.execute(''' SELECT users.name, COUNT(*) AS total_orders FROM users JOIN orders ON users.id = orders.user_id GROUP BY users.id ORDER BY total_orders DESC LIMIT 1 ''') print(cur.fetchone()) # 4. 查询所有不重复的年龄。 cur.execute('SELECT DISTINCT age FROM users') print([row[0] for row in cur.fetchall()]) # 5. 查询订单日期在2021年9月1日至9月4日之间的订单总金额。 cur.execute('SELECT SUM(amount) FROM orders WHERE order_date BETWEEN "2021-09-01" AND "2021-09-04"') print(cur.fetchone()[0]) # 6. 查询年龄不大于25岁的用户的订单数量,并按照降序排序。 cur.execute(''' SELECT users.name, COUNT(*) AS total_orders FROM users JOIN orders ON users.id = orders.user_id WHERE age <= 25 GROUP BY users.id ORDER BY total_orders DESC ''') print(cur.fetchall()) # 7. 查询订单总金额排名前3的用户的姓名和订单总金额。 cur.execute(''' SELECT users.name, SUM(amount) AS total_amount FROM users JOIN orders ON users.id = orders.user_id GROUP BY users.id ORDER BY total_amount DESC LIMIT 3 ''') print(cur.fetchall()) # 8. 查询订单总金额最大的用户的姓名和订单总金额。 cur.execute(''' SELECT users.name, SUM(amount) AS total_amount FROM users JOIN orders ON users.id = orders.user_id GROUP BY users.id ORDER BY total_amount DESC LIMIT 1 ''') print(cur.fetchone()) # 9. 查询订单总金额最小的用户的姓名和订单总金额。 cur.execute(''' SELECT users.name, SUM(amount) AS total_amount FROM users JOIN orders ON users.id = orders.user_id GROUP BY users.id ORDER BY total_amount ASC LIMIT 1 ''') print(cur.fetchone()) # 10. 查询所有名字中含有“李”的用户,按照名字升序排序。 cur.execute('SELECT * FROM users WHERE name LIKE "%李%" ORDER BY name ASC') print(cur.fetchall()) # 11. 查询所有年龄大于20岁的用户,按照年龄降序排序,并只显示前5条记录。 cur.execute('SELECT * FROM users WHERE age > 20 ORDER BY age DESC LIMIT 5') print(cur.fetchall()) # 12. 查询每个用户的订单数量和订单总金额,并按照总金额降序排序。 cur.execute(''' SELECT users.name, COUNT(*) AS total_orders, SUM(amount) AS total_amount FROM users JOIN orders ON users.id = orders.user_id GROUP BY users.id ORDER BY total_amount DESC ''') print(cur.fetchall()) # 关闭游标和连接 cur.close() conn.close() ``` 注意:在运行代码之前,需要先安装 PyMySQL 模块,可以使用以下命令进行安装: ``` pip install pymysql ```

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值