Bloom filter的实现以及常用的hash函数
bloom_filter.c
参考:
bloom filter利用时间换空间的思想,利用多个哈希函数,将一个元素的存在状态映射到多个bit中,特别是在网络环境中,BF具有广泛的用途,
关键问题就是要减少false positive rate(可以设置参数来调节),扩展有 counting BF。这里选用的hash函数是表现较好的 BKDRHash ,
SDBMHash, DJBHash 。
Bloom-filter代码:
bloom_filter.h
#ifndef __BLOOM_FILTER_H__
#define __BLOOM_FILTER_H__
#include<stdlib.h>
typedef unsigned int (*hashfunc_t)(const char *, unsigned int len);
typedef struct {
size_t asize;
unsigned char *a; //to store the state of existence bits
size_t nfuncs;
hashfunc_t *funcs; // hash funcs
} BLOOM;
BLOOM *bloom_create(size_t size, size_t nfuncs, ...);
int bloom_destroy(BLOOM *bloom);
int bloom_add(BLOOM *bloom, const char *s, unsigned int len);
int bloom_check(BLOOM *bloom, const char *s, unsigned int len);
#endif
bloom_filter.c
#include<limits.h>
#include<stdarg.h>
#include"bloom_filter.h"
#define SETBIT(a, n) (a[n/CHAR_BIT] |= (1<<(n%CHAR_BIT)))
#define GETBIT(a, n) (a[n/CHAR_BIT] & (1<<(n%CHAR_BIT)))
BLOOM *bloom_create(size_t size, size_t nfuncs, ...)
{
BLOOM *bloom;
va_list l;
int n;
if(!(bloom=malloc(sizeof(BLOOM)))) return NULL;
// ceil the number of char to malloc for the bloom
if(!(bloom->a=calloc((size+CHAR_BIT-1)/CHAR_BIT, sizeof(char)))) {
free(bloom);
return NULL;
}
if(!(bloom->funcs=(hashfunc_t*)malloc(nfuncs*sizeof(hashfunc_t)))) {
free(bloom->a);
free(bloom);
return NULL;
}
va_start(l, nfuncs);
for(n=0; n<nfuncs; ++n) {
bloom->funcs[n]=va_arg(l, hashfunc_t);
}
va_end(l);
bloom->nfuncs=nfuncs;
bloom->asize=size;
return bloom;
}
int bloom_destroy(BLOOM *bloom)
{
free(bloom->a);
free(bloom->funcs);
free(bloom);
return 0;
}
int bloom_add(BLOOM *bloom, const char *s, unsigned int len)
{
size_t n;
for(n=0; n<bloom->nfuncs; ++n) {
SETBIT(bloom->a, bloom->funcs[n](s, len)%bloom->asize);
}
return 0;
}
int bloom_check(BLOOM *bloom, const char *s, unsigned int len)
{
size_t n;
for(n=0; n<bloom->nfuncs; ++n) {
if(!(GETBIT(bloom->a, bloom->funcs[n](s, len)%bloom->asize))) return 0;
}
return 1;
}
常用hash函数文件:
#include "global.h"
// Author: Arash Partow - 2002
unsigned int RSHash(char* str, unsigned int len)
{
unsigned int b = 378551;
unsigned int a = 63689;
unsigned int hash = 0;
unsigned int i = 0;
for(i = 0; i < len; str++, i++)
{
hash = hash * a + (*str);
a = a * b;
}
return hash;
}
/* End Of RS Hash Function */
unsigned int JSHash(char* str, unsigned int len)
{
unsigned int hash = 1315423911;
unsigned int i = 0;
for(i = 0; i < len; str++, i++)
{
hash ^= ((hash << 5) + (*str) + (hash >> 2));
}
return hash;
}
/* End Of JS Hash Function */
unsigned int PJWHash(char* str, unsigned int len)
{
const unsigned int BitsInUnsignedInt = (unsigned int)(sizeof(unsigned int) * 8);
const unsigned int ThreeQuarters = (unsigned int)((BitsInUnsignedInt * 3) / 4);
const unsigned int OneEighth = (unsigned int)(BitsInUnsignedInt / 8);
const unsigned int HighBits = (unsigned int)(0xFFFFFFFF) << (BitsInUnsignedInt - OneEighth);
unsigned int hash = 0;
unsigned int test = 0;
unsigned int i = 0;
for(i = 0; i < len; str++, i++)
{
hash = (hash << OneEighth) + (*str);
if((test = hash & HighBits) != 0)
{
hash = (( hash ^ (test >> ThreeQuarters)) & (~HighBits));
}
}
return hash;
}
/* End Of P. J. Weinberger Hash Function */
unsigned int ELFHash(char* str, unsigned int len)
{
unsigned int hash = 0;
unsigned int x = 0;
unsigned int i = 0;
for(i = 0; i < len; str++, i++)
{
hash = (hash << 4) + (*str);
if((x = hash & 0xF0000000L) != 0)
{
hash ^= (x >> 24);
}
hash &= ~x;
}
return hash;
}
/* End Of ELF Hash Function */
/*This hash function comes from Brian Kernighan and Dennis Ritchie's book "The C Programming Language".
* It is a simple hash function using a strange set of possible seeds which all constitute a pattern
* of 31....31...31 etc, it seems to be very similar to the DJB hash function.
*/
unsigned int BKDRHash(char* str, unsigned int len)
{
unsigned int seed = 131; /* 31 131 1313 13131 131313 etc.. */
unsigned int hash = 0;
unsigned int i = 0;
for(i = 0; i < len; str++, i++)
{
hash = (hash * seed) + (*str);
}
return hash;
}
/* End Of BKDR Hash Function */
/*This is the algorithm of choice which is used in the open source SDBM project.
* The hash function seems to have a good over-all distribution for many different
* data sets. It seems to work well in situations where there is a high variance
* in the MSBs of the elements in a data set.
*/
unsigned int SDBMHash(char* str, unsigned int len)
{
unsigned int hash = 0;
unsigned int i = 0;
for(i = 0; i < len; str++, i++)
{
hash = (*str) + (hash << 6) + (hash << 16) - hash;
}
return hash;
}
/* End Of SDBM Hash Function */
/*An algorithm produced by Professor Daniel J. Bernstein and shown first
* to the world on the usenet newsgroup comp.lang.c. It is one of the most
* efficient hash functions ever published.
*/
unsigned int DJBHash(char* str, unsigned int len)
{
unsigned int hash = 5381;
unsigned int i = 0;
for(i = 0; i < len; str++, i++)
{
hash = ((hash << 5) + hash) + (*str);
}
return hash;
}
/* End Of DJB Hash Function */
unsigned int DEKHash(char* str, unsigned int len)
{
unsigned int hash = len;
unsigned int i = 0;
for(i = 0; i < len; str++, i++)
{
hash = ((hash << 5) ^ (hash >> 27)) ^ (*str);
}
return hash;
}
/* End Of DEK Hash Function */
unsigned int BPHash(char* str, unsigned int len)
{
unsigned int hash = 0;
unsigned int i = 0;
for(i = 0; i < len; str++, i++)
{
hash = hash << 7 ^ (*str);
}
return hash;
}
/* End Of BP Hash Function */
unsigned int FNVHash(char* str, unsigned int len)
{
const unsigned int fnv_prime = 0x811C9DC5;
unsigned int hash = 0;
unsigned int i = 0;
for(i = 0; i < len; str++, i++)
{
hash *= fnv_prime;
hash ^= (*str);
}
return hash;
}
/* End Of FNV Hash Function */
unsigned int APHash(char* str, unsigned int len)
{
unsigned int hash = 0xAAAAAAAA;
unsigned int i = 0;
for(i = 0; i < len; str++, i++)
{
hash ^= ((i & 1) == 0) ? ( (hash << 7) ^ (*str) * (hash >> 3)) :
(~((hash << 11) + (*str) ^ (hash >> 5)));
}
return hash;
}
/* End Of AP Hash Function */
测试程序:这里是先对文件分块,而后对指纹进行BF,以达到节省空间的目的。
#include "global.h"
int main(int argc, char ** argv){
int i = 0, id = 1, result, count=0;
int sockfd, fd;
FileInfo *fi=file_new();
FingerChunk *p;
BLOOM *bloom = NULL;
if(argc != 2)
err_quit("usage:bloom <file>");
//creat a bloom filter .TODO what size is best??
if(!(bloom=bloom_create(2500000, 3, BKDRHash, SDBMHash, DJBHash))) {
fprintf(stderr, "ERROR: Could not create bloom filter\n");
return EXIT_FAILURE;
}
// chunk the file
strcpy(fi->file_path, argv[1]);
chunk_file(fi);
printf("File size : %lld\n",fi->file_size);
printf("Chunk Num : %d\n",fi->chunknum);
p= fi->first;
while(p){
char hash[41];
digestToHash(p->chunk_hash,hash);
printf("chunklen : %d , Fingerprint : %s\n", p->chunklen, hash); // just
//add each signature to the bloom filter
bloom_add(bloom, p->chunk_hash, 20); //
p=p->next;
}
//How to test the false positive ??
p = fi->first;
printf("1.BKDR Hash Function value : %u\n", BKDRHash(p->chunk_hash, 20));
printf("2.SDBM Hash Function value : %u\n", SDBMHash(p->chunk_hash, 20));
printf("3.DJB Hash Function value : %u\n", DJBHash(p->chunk_hash, 20));
bloom_destroy(bloom);
file_free(fi);
}
参考:
1. http://www.cnblogs.com/-clq/archive/2012/05/31/2528153.html
2. http://www.partow.net/programming/hashfunctions/#top