介绍
在包含几百万乃至上千万个元素的集合中快速查找对应的元素,是一种常见的应用场景,因此本文将探讨一种在大集合中快速查找对应元素的方法。
查找方案
哈希查找和字典树是两种高性能的查找方法。其中字典树使用最长前缀匹配法,其查找速度非常快,但由于更新时需要调整整个树,导致其更新速度较慢,因此字典树不适用于动态集合。而哈希查找解决了更新速度慢的问题,因此本文主要介绍基于哈希查找的词典匹配方法。
本文将用到两种数据结构,分别为BloomFilter和哈希表:哈希表用来构造整个词典,BloomFilter用来过滤对不存在于集合中的元素的查找。关于BloomFilter的原理介绍,可参考博主的另一篇博文《BloomFilter原理和实现》。
代码实现
程序在DICTIONARY_PATH指定的词典中查找DATA_PATH中的每个元素是否存在,并将每个元素的查找结果(yes/no)写入RESULT_PATH。
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cstring>
#pragma warning(disable:4996);
/*
 * Sizing constants.
 *
 * Every expansion that is an expression is wrapped in parentheses so the macro
 * behaves as a single value inside a larger expression. Without them,
 * `h % HASHTABLE_LEN` expands to `h % 1024 * 64`, which parses as
 * `(h % 1024) * 64` and reaches only 1024 of the 65536 buckets.
 */
#define HASHTABLE_LEN (1024 * 64) /* number of buckets in the hash table */
#define HASH_NUM 5 /* number of hash functions used by the Bloom filter */
#define BF_VECTOR_LEN (1024 * 1024) /* size of the Bloom filter's char array; it is addressed as BF_VECTOR_LEN * 8 bits */
#define STR_LEN 100 /* size of a string buffer, terminator included */
/* File locations used by main(). These are absolute paths into one user's
 * home directory, so they must be edited before running elsewhere. */
/* Dictionary (pattern) file that initDictionary() loads. */
const char * DICTIONARY_PATH = "C:/Users/Lenovo/Documents/Upupoo/Docker/config/文件/文本快速查找/data/patterns-127w.txt";
/* File of words to look up; read by writeResult(). */
const char * DATA_PATH = "C:/Users/Lenovo/Documents/Upupoo/Docker/config/文件/文本快速查找/data/words-98w.txt";
/* Output file that writeResult() creates or overwrites. */
const char * RESULT_PATH = "C:/Users/Lenovo/Documents/Upupoo/Docker/config/文件/文本快速查找/data/result.txt";
/* Statistics accumulated by writeResult() and read by main(). */
static float totalCount = 0; /* number of words read from the data file */
static float errorCount = 0; /* words that passed the Bloom filter but were not found in the hash table */
/* Hash functions applied by insertBF() and contains(); main() fills the slots
 * before either is called. Each takes a buffer and its length. */
unsigned int(*m_pHash[HASH_NUM])(const char *, unsigned int); //Bloom filter hash function array
/// Hash table structures
/* One entry of a bucket's singly linked chain. */
typedef struct DataNode {
char world[STR_LEN]; /* the stored string, NUL-terminated */
struct DataNode * next; /* next entry in the same bucket, or NULL at the end of the chain */
}DataNode;
/* Chained hash table: an array of bucket heads, each a DataNode list. */
typedef struct HashTable {
struct DataNode * firstNode[HASHTABLE_LEN]; /* head of each bucket's chain; NULL when the bucket is empty */
}HashTable;
// Bloom filter
typedef struct BloomFilter {
int currentNum; /* number of insertBF() calls made on this filter */
char bitVector[BF_VECTOR_LEN]; /* bit array: bit k lives in byte k >> 3 at position k & 7 */
}BloomFilter;
/*
 * Allocate an empty Bloom filter and store its address in *BF.
 *
 * The element counter starts at 0 and every bit of the vector is clear.
 * The filter is a single heap object; the caller releases it with free().
 * If the allocation fails, an error is printed and the program exits with
 * status 1 instead of dereferencing a null pointer.
 */
void initBF(BloomFilter ** BF) {
    /* calloc zero-fills the object, which sets currentNum to 0 and clears the
     * bit vector in one step. The cast keeps this valid when built as C++. */
    BloomFilter *filter = (BloomFilter *)calloc(1, sizeof(BloomFilter));

    if (filter == NULL) {
        fprintf(stderr, "initBF: out of memory\n");
        exit(1);
    }
    *BF = filter;
}
/*
 * Record a string in the Bloom filter.
 *
 * BF  - filter to update
 * str - buffer holding the string
 * len - number of bytes of str to hash
 *
 * Bumps the filter's element counter, then sets one bit per hash function in
 * m_pHash. Each hash value is reduced modulo the number of bits in the vector.
 */
void insertBF(BloomFilter * BF, char * str, int len) {
    int k;

    BF->currentNum += 1;
    k = 0;
    while (k < HASH_NUM) {
        int bit = m_pHash[k](str, len) % (BF_VECTOR_LEN * 8);
        int slot = bit >> 3;   /* byte that holds the bit */
        int shift = bit & 7;   /* position of the bit inside that byte */

        BF->bitVector[slot] |= 1 << shift;
        ++k;
    }
}
/*
 * Test whether a string may have been inserted into the Bloom filter.
 *
 * BF  - filter to query
 * str - buffer holding the string
 * len - number of bytes of str to hash
 *
 * Returns 0 as soon as one of the HASH_NUM probed bits is clear, and 1 when
 * all probed bits are set.
 */
int contains(BloomFilter * BF, char * str, int len) {
    int k;

    for (k = 0; k != HASH_NUM; ++k) {
        int bit = m_pHash[k](str, len) % (BF_VECTOR_LEN * 8);
        int probe = BF->bitVector[bit >> 3] & (1 << (bit & 7));

        if (!probe) {
            return 0;
        }
    }
    return 1;
}
哈希函数组
/*
 * One-at-a-time hash over the first len bytes of buf.
 *
 * buf - bytes to hash (need not be NUL-terminated)
 * len - number of bytes to read
 *
 * Returns the hash value; len == 0 yields 0.
 *
 * The accumulator is unsigned so that the additions and left shifts wrap
 * instead of overflowing a signed int (undefined behaviour), and so that the
 * right shifts never sign-extend. Each input byte is read as unsigned char so
 * the result does not depend on whether plain char is signed.
 */
unsigned int OAAT(const char *buf, unsigned int len)
{
    unsigned int hash = 0;
    unsigned int i;

    for (i = 0; i < len; ++i)
    {
        hash += (unsigned char)buf[i];
        hash += (hash << 10);
        hash ^= (hash >> 6);
    }
    /* Final mixing of the accumulated bits. */
    hash += (hash << 3);
    hash ^= (hash >> 11);
    hash += (hash << 15);
    return hash;
}
/*
 * RS hash over the first `length` bytes of str.
 *
 * Each step multiplies the accumulator by a running multiplier and adds the
 * next byte; the multiplier itself is then scaled by a fixed constant.
 * All arithmetic is unsigned and wraps. Returns 0 for length == 0.
 */
unsigned int RSHash(const char* str, unsigned int length)
{
    const unsigned int step = 378551;
    unsigned int multiplier = 63689;
    unsigned int acc = 0;
    unsigned int pos;

    for (pos = 0; pos < length; pos++)
    {
        acc = acc * multiplier + str[pos];
        multiplier *= step;
    }
    return acc;
}
/*
 * JS hash over the first `length` bytes of str.
 *
 * Starts from a fixed seed and, for every byte, XORs the accumulator with the
 * sum of itself shifted left by 5, the byte, and itself shifted right by 2.
 * Returns the seed (1315423911) for length == 0.
 */
unsigned int JSHash(const char* str, unsigned int length)
{
    unsigned int acc = 1315423911;
    const char *end = str + length;

    while (str != end)
    {
        unsigned int mix = (acc << 5) + (*str) + (acc >> 2);
        acc ^= mix;
        ++str;
    }
    return acc;
}
/*
 * ELF hash over the first `length` bytes of str.
 *
 * Shifts the accumulator left by 4 and adds each byte; whenever the top four
 * bits become non-zero they are folded back in (shifted right by 24) and then
 * cleared. Returns 0 for length == 0.
 */
unsigned int ELFHash(const char* str, unsigned int length)
{
    unsigned int acc = 0;
    unsigned int pos;

    for (pos = 0; pos < length; pos++)
    {
        unsigned int top;

        acc = (acc << 4) + str[pos];
        top = acc & 0xF0000000L;
        if (top != 0)
        {
            acc ^= (top >> 24);
        }
        acc &= ~top;   /* no-op when top is 0 */
    }
    return acc;
}
/*
 * PJW hash over the first `length` bytes of str.
 *
 * The shift widths and the high-bit mask are derived from the width of
 * unsigned int: the accumulator advances by one eighth of the width per byte,
 * and any bits that reach the top eighth are folded back in (shifted right by
 * three quarters of the width) and cleared. Returns 0 for length == 0.
 */
unsigned int PJWHash(const char* str, unsigned int length)
{
    const unsigned int wordBits = (unsigned int)(sizeof(unsigned int) * 8);
    const unsigned int foldShift = (unsigned int)((wordBits * 3) / 4);
    const unsigned int stepShift = (unsigned int)(wordBits / 8);
    const unsigned int topMask =
        (unsigned int)(0xFFFFFFFF) << (wordBits - stepShift);
    unsigned int acc = 0;
    const char *end = str + length;

    while (str != end)
    {
        unsigned int spill;

        acc = (acc << stepShift) + (*str);
        spill = acc & topMask;
        if (spill != 0)
        {
            acc = ((acc ^ (spill >> foldShift)) & (~topMask));
        }
        ++str;
    }
    return acc;
}
/*
 * Allocate a hash table with every bucket empty and store it in *table.
 *
 * The table is released with freeHashTable(). If the allocation fails, an
 * error is printed and the program exits with status 1 instead of writing
 * through a null pointer.
 */
void initHashTable(HashTable ** table) {
    int i;
    /* The cast keeps this valid when the file is built as C++. */
    HashTable *fresh = (HashTable *)malloc(sizeof(HashTable));

    if (fresh == NULL) {
        fprintf(stderr, "initHashTable: out of memory\n");
        exit(1);
    }
    for (i = 0; i < HASHTABLE_LEN; i++) {
        fresh->firstNode[i] = NULL;
    }
    *table = fresh;
}
/*
 * Release a hash table: every node of every bucket chain, then the table
 * object itself. The table pointer must not be used afterwards.
 */
void freeHashTable(HashTable * table) {
    int bucket;

    for (bucket = 0; bucket < HASHTABLE_LEN; ++bucket) {
        DataNode *cur = table->firstNode[bucket];

        while (cur != NULL) {
            /* Read the link before the node that holds it is freed. */
            DataNode *following = cur->next;

            free(cur);
            cur = following;
        }
    }
    free(table);
}
/*
 * Insert a copy of a NUL-terminated string at the head of its bucket.
 *
 * table - table to insert into
 * str   - string to store
 *
 * A string that does not fit in DataNode.world (STR_LEN bytes including the
 * terminator) is rejected with a message on stderr rather than copied past
 * the end of the buffer. If the node cannot be allocated, the program exits
 * with status 1.
 */
void insertToHashTable(HashTable * table, char * str) {
    size_t len = strlen(str);
    unsigned int index;
    DataNode *node;

    if (len >= STR_LEN) {
        fprintf(stderr, "insertToHashTable: string too long, skipped\n");
        return;
    }

    /* The cast keeps this valid when the file is built as C++. */
    node = (DataNode *)malloc(sizeof(DataNode));
    if (node == NULL) {
        fprintf(stderr, "insertToHashTable: out of memory\n");
        exit(1);
    }

    /* Same bucket expression as the lookup side, so entries stay findable. */
    index = OAAT(str, (unsigned int)len) % HASHTABLE_LEN;
    memcpy(node->world, str, len + 1);   /* length verified above */
    node->next = table->firstNode[index];
    table->firstNode[index] = node;
}
/*
 * Load the dictionary file into the hash table and the Bloom filter.
 *
 * path  - file of whitespace-separated words
 * table - hash table that receives one node per word
 * BF    - Bloom filter that receives every word
 *
 * Prints a message and exits with status 1 if the file cannot be opened or a
 * node cannot be allocated.
 *
 * Reads are limited to STR_LEN - 1 characters so that a long token cannot
 * overrun the buffer; a longer token is consumed as several consecutive
 * pieces, each of which is stored as its own word.
 */
void initDictionary(const char * path, HashTable * table, BloomFilter * BF) {
    char fmt[16];
    char word[STR_LEN];
    FILE *fp = fopen(path, "r");

    if (fp == NULL) {
        printf("文件打开失败\n");
        exit(1);
    }

    /* Build "%<STR_LEN-1>s" so the width always follows the buffer size. */
    snprintf(fmt, sizeof fmt, "%%%ds", STR_LEN - 1);

    while (fscanf(fp, fmt, word) == 1) {
        int strLength = (int)strlen(word);
        int index;
        /* The cast keeps this valid when the file is built as C++. */
        DataNode *node = (DataNode *)malloc(sizeof(DataNode));

        if (node == NULL) {
            fclose(fp);
            fprintf(stderr, "initDictionary: out of memory\n");
            exit(1);
        }
        memcpy(node->world, word, (size_t)strLength + 1);

        /* Push the node onto the front of its bucket's chain. */
        index = OAAT(node->world, strLength) % HASHTABLE_LEN;
        node->next = table->firstNode[index];
        table->firstNode[index] = node;

        insertBF(BF, node->world, strLength);
    }

    fclose(fp);
}
/*
 * Look a string up in the hash table.
 *
 * table - table to search
 * str   - NUL-terminated string to find
 * len   - number of bytes of str used to compute the bucket
 *
 * Returns 1 if a node in the string's bucket stores an equal string,
 * otherwise 0.
 */
int existWorldInHashTable(HashTable * table, char * str, int len) {
    int bucket = OAAT(str, len) % HASHTABLE_LEN;
    DataNode *cur;

    /* Walk the chain and compare the candidate with each stored pattern. */
    for (cur = table->firstNode[bucket]; cur != NULL; cur = cur->next) {
        if (!strcmp(cur->world, str)) {
            return 1;
        }
    }
    return 0;
}
/*
 * Look up every word of the source file and write one result line per word.
 *
 * table      - hash table holding the dictionary
 * srcPath    - file of whitespace-separated words to look up
 * resultPath - output file, created or truncated
 * BF         - Bloom filter built from the same dictionary
 *
 * Each output line is "<word> yes" or "<word> no". The Bloom filter is asked
 * first; the hash table is searched only when the filter reports a possible
 * hit. totalCount is incremented for every word, errorCount for every word
 * the filter let through but the table does not contain.
 *
 * Prints a message and exits with status 1 if either file cannot be opened.
 * Reads are limited to STR_LEN - 1 characters so that a long token cannot
 * overrun the buffer; a longer token is processed as several pieces.
 *
 * NOTE(review): the output is opened in text mode and lines end with "\r\n";
 * on platforms that translate '\n' this may emit "\r\r\n" — confirm intent.
 */
void writeResult(HashTable * table, const char * srcPath, const char * resultPath, BloomFilter * BF) {
    char fmt[16];
    char str[STR_LEN];
    FILE *fp;
    FILE *op;

    if ((fp = fopen(srcPath, "r")) == NULL) {
        printf("文件打开失败\n");
        exit(1);
    }
    if ((op = fopen(resultPath, "w")) == NULL) {
        fclose(fp);   /* do not leave the input stream open on this path */
        printf("文件打开失败\n");
        exit(1);
    }

    /* Build "%<STR_LEN-1>s" so the width always follows the buffer size. */
    snprintf(fmt, sizeof fmt, "%%%ds", STR_LEN - 1);

    while (fscanf(fp, fmt, str) == 1) {
        int strLength = (int)strlen(str);
        const char *verdict = "no";

        ++totalCount;
        if (contains(BF, str, strLength)) {               /* Bloom filter first */
            if (existWorldInHashTable(table, str, strLength)) {   /* then the hash table */
                verdict = "yes";
            }
            else {
                ++errorCount;   /* the filter reported a hit the table does not confirm */
            }
        }
        fprintf(op, "%s %s\r\n", str, verdict);
    }

    fclose(fp);
    /* fclose flushes buffered output, so a failure here means lost results. */
    if (fclose(op) != 0) {
        fprintf(stderr, "writeResult: failed to write result file\n");
    }
}
/*
 * Program entry point: build the dictionary, scan the data file, report the
 * share of looked-up words that passed the Bloom filter without being in the
 * dictionary, then release all memory.
 *
 * Optional arguments override the built-in paths:
 *   argv[1] - dictionary file (default DICTIONARY_PATH)
 *   argv[2] - data file       (default DATA_PATH)
 *   argv[3] - result file     (default RESULT_PATH)
 *
 * Returns 0 on completion.
 */
int main(int argc, char ** argv) {
    const char *dictionaryPath = (argc > 1) ? argv[1] : DICTIONARY_PATH;
    const char *dataPath = (argc > 2) ? argv[2] : DATA_PATH;
    const char *resultPath = (argc > 3) ? argv[3] : RESULT_PATH;
    HashTable * table;
    BloomFilter * BF;
    float bf_error = 0;

    /* Set up the Bloom filter. */
    initBF(&BF);

    /* Register the hash functions before anything is inserted or queried. */
    m_pHash[0] = OAAT;
    m_pHash[1] = RSHash;
    m_pHash[2] = JSHash;
    m_pHash[3] = PJWHash;
    m_pHash[4] = ELFHash;

    printf("初始化哈希表\n");
    initHashTable(&table);
    initDictionary(dictionaryPath, table, BF);

    printf("开始扫描\n");
    writeResult(table, dataPath, resultPath, BF);

    /* An empty data file leaves totalCount at 0; report 0 rather than 0/0. */
    if (totalCount > 0) {
        bf_error = errorCount / totalCount;
    }
    printf("BF假阳性误判率为:%f\n",bf_error);
    printf("程序完成\n");

    freeHashTable(table);
    free(BF);
    return 0;
}
结论
本文简要介绍了词典快速匹配查找方案,主要介绍了使用哈希表的查找方法,并使用C语言对其进行了实现。