词典快速匹配方案

最新推荐文章于 2023-06-25 16:22:10 发布

丶丶路遥

最新推荐文章于 2023-06-25 16:22:10 发布

阅读量1.6k

点赞数

分类专栏： Hash算法汇总算法文章标签：高性能文本匹配

本文链接：https://blog.csdn.net/qq_25956141/article/details/102088584

版权

算法同时被 2 个专栏收录

36 篇文章 1 订阅

订阅专栏

Hash算法汇总

3 篇文章 1 订阅

订阅专栏

介绍

在包含几百万乃至上千万个元素的集合中快速查找指定的元素，是一种常见的应用场景。因此，本文将探讨一种在大集合中快速查找指定元素的方法。

查找方案

哈希查找和字典树是两种高性能的查找方法。其中字典树使用最长前缀匹配法，其查找速度非常快，但由于更新时需要调整整个树，导致其更新速度较慢，因此字典树不适用于动态集合。而哈希查找解决了更新速度慢的问题，因此本文主要介绍基于哈希查找的词典匹配方法。
本文将用到两种数据结构，分别为BloomFilter和哈希表：哈希表用来构造整个词典，BloomFilter用来过滤对不存在于集合中的元素的查找。关于BloomFilter的原理介绍，可参考博主的另一篇博文《BloomFilter原理和实现》。

在这里插入图片描述

代码实现

程序在DICTIONARY_PATH指定的词典中逐个查找DATA_PATH中的每个元素，并把每个元素是否存在于词典中（yes/no）写入RESULT_PATH。

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
/* <string.h> rather than the C++-only <cstring>: it exists for C compilers and
 * declares strlen/strcpy/strcmp/memset in the global namespace, which is how
 * this file calls them. */
#include <string.h>

/* MSVC only: silence C4996 (deprecation warnings for fopen/fscanf/strcpy).
 * A pragma takes no trailing semicolon. */
#ifdef _MSC_VER
#pragma warning(disable:4996)
#endif


/*
 * Sizing constants.
 *
 * Every expansion that is an expression is parenthesized so the macro behaves
 * as a single value inside a larger expression. Without the parentheses
 * `h % HASHTABLE_LEN` expands to `h % 1024 * 64`, i.e. `(h % 1024) * 64`,
 * which can only ever produce 1024 distinct bucket indices.
 */
#define HASHTABLE_LEN (1024 * 64)		/* number of buckets in the hash table */
#define HASH_NUM 5						/* number of hash functions used by the Bloom filter */
#define BF_VECTOR_LEN (1024 * 1024)		/* Bloom filter vector size in bytes (8 bits per byte) */
#define STR_LEN 100						/* size of a word buffer, terminating NUL included */


/* File locations used by main(): the dictionary that is loaded into the hash
 * table, the list of words to look up, and the file the results are written to. */
const char * DICTIONARY_PATH = "C:/Users/Lenovo/Documents/Upupoo/Docker/config/文件/文本快速查找/data/patterns-127w.txt";
const char * DATA_PATH = "C:/Users/Lenovo/Documents/Upupoo/Docker/config/文件/文本快速查找/data/words-98w.txt";
const char * RESULT_PATH = "C:/Users/Lenovo/Documents/Upupoo/Docker/config/文件/文本快速查找/data/result.txt";

/*
 * Lookup statistics, incremented by writeResult() and read by main().
 *
 * Stored as double rather than float: a float cannot represent integers above
 * 2^24 exactly, so `++counter` silently stops advancing at 16777216. A double
 * still gives main() a floating-point division for the ratio.
 */
static double totalCount = 0;	/* number of words looked up */
static double errorCount = 0;	/* words that passed the Bloom filter but were not in the hash table */

/* Signature shared by the string hashes in this file: (buffer, length) -> hash value. */
typedef unsigned int (*BloomHashFn)(const char *, unsigned int);

/* Hash functions used by the Bloom filter; main() fills in the slots. */
BloomHashFn m_pHash[HASH_NUM];

																///哈希表结构
/* One entry of a hash-table bucket: a stored word and the link to the next
 * entry of the same bucket. */
typedef struct DataNode DataNode;
struct DataNode {
	char world[STR_LEN];	/* the stored word, NUL-terminated */
	DataNode *next;			/* next entry of the bucket's chain, or NULL */
};

/* Chained hash table: one singly linked list head per bucket. */
typedef struct HashTable HashTable;
struct HashTable {
	struct DataNode *firstNode[HASHTABLE_LEN];	/* head of each bucket's chain, NULL when empty */
};

/* Bloom filter state. */
typedef struct BloomFilter BloomFilter;
struct BloomFilter {
	int currentNum;					/* number of insertBF() calls made on this filter */
	char bitVector[BF_VECTOR_LEN];	/* bit array, addressed as BF_VECTOR_LEN * 8 individual bits */
};

/*
 * Allocate a Bloom filter with the counter and every bit cleared, and store
 * it in *BF.
 *
 * BF: out-parameter that receives the new filter; the caller releases it
 *     with free().
 *
 * Allocation failure is fatal: a message is printed and the process exits
 * with status 1, the same way this file handles a file that cannot be opened.
 * (Previously the malloc() result was dereferenced without being checked.)
 */
void initBF(BloomFilter ** BF) {
	/* calloc zero-fills, which clears currentNum and the whole bit vector. */
	BloomFilter * filter = (BloomFilter *)calloc(1, sizeof(BloomFilter));
	if (filter == NULL) {
		printf("内存分配失败\n");
		exit(1);
	}
	*BF = filter;
}

/*
 * Add a string to the Bloom filter.
 *
 * BF:  filter to update; its insertion counter is incremented.
 * str: bytes to hash.
 * len: number of bytes of str passed to each hash function.
 *
 * Each of the HASH_NUM functions in m_pHash selects one bit of the vector,
 * and that bit is set.
 */
void insertBF(BloomFilter * BF, char * str, int len) {
	int k;

	BF->currentNum += 1;
	for (k = 0; k < HASH_NUM; ++k) {
		int bit = m_pHash[k](str, len) % (BF_VECTOR_LEN * 8);
		int byteIndex = bit >> 3;		/* which byte holds the bit */
		int mask = 1 << (bit & 7);		/* position of the bit inside that byte */

		BF->bitVector[byteIndex] |= mask;
	}
}

/*
 * Test whether a string may have been added to the Bloom filter.
 *
 * BF:  filter to query.
 * str: bytes to hash.
 * len: number of bytes of str passed to each hash function.
 *
 * Returns 0 as soon as one of the HASH_NUM selected bits is clear, and 1 when
 * all of them are set.
 */
int contains(BloomFilter * BF, char * str, int len) {
	int k = 0;

	while (k < HASH_NUM) {
		int bit = m_pHash[k](str, len) % (BF_VECTOR_LEN * 8);
		int present = BF->bitVector[bit >> 3] & (1 << (bit & 7));

		if (!present) {
			return 0;
		}
		++k;
	}
	return 1;
}

哈希函数组

/*
 * Bob Jenkins' one-at-a-time hash of the first `len` bytes of `buf`.
 *
 * buf: bytes to hash.
 * len: number of bytes to read from buf.
 * Returns the hash value.
 *
 * All arithmetic is done on unsigned int, where overflow wraps and right
 * shifts are logical. The previous version accumulated into a signed int,
 * which made overflow undefined behaviour and the right shifts of negative
 * values implementation-defined. Each byte is read as unsigned char so that
 * bytes >= 0x80 are not sign-extended on platforms where char is signed.
 */
unsigned int OAAT(const char *buf, unsigned int len)
{
	unsigned int hash = 0;
	unsigned int i;

	for (i = 0; i < len; ++i)
	{
		hash += (unsigned char)buf[i];
		hash += (hash << 10);
		hash ^= (hash >> 6);
	}

	/* final avalanche */
	hash += (hash << 3);
	hash ^= (hash >> 11);
	hash += (hash << 15);

	return hash;
}

/*
 * RS hash of the first `length` bytes of `str`.
 *
 * The running value is multiplied by a factor and the next byte is added;
 * the factor itself is multiplied by a fixed constant after every byte.
 * Returns 0 when length is 0.
 */
unsigned int RSHash(const char* str, unsigned int length)
{
	const unsigned int step = 378551;
	unsigned int multiplier = 63689;
	unsigned int result = 0;
	unsigned int pos;

	for (pos = 0; pos < length; pos++)
	{
		result = result * multiplier + str[pos];
		multiplier *= step;
	}

	return result;
}

/*
 * JS hash of the first `length` bytes of `str`.
 *
 * Starts from the seed 1315423911 and, for every byte, XORs the running value
 * with (value << 5) + byte + (value >> 2). Returns the seed when length is 0.
 */
unsigned int JSHash(const char* str, unsigned int length)
{
	unsigned int acc = 1315423911;
	unsigned int remaining = length;

	while (remaining != 0)
	{
		acc ^= ((acc << 5) + (*str) + (acc >> 2));
		++str;
		--remaining;
	}

	return acc;
}

/*
 * ELF hash of the first `length` bytes of `str`.
 *
 * Each byte is added after shifting the running value left by 4 bits. When
 * the top 4 bits become non-zero they are XORed back in (shifted right by 24)
 * and then cleared. Returns 0 when length is 0.
 */
unsigned int ELFHash(const char* str, unsigned int length)
{
	unsigned int h = 0;
	unsigned int idx;

	for (idx = 0; idx < length; idx++)
	{
		unsigned int top;

		h = (h << 4) + str[idx];
		top = h & 0xF0000000L;

		if (top != 0)
		{
			h ^= (top >> 24);
		}

		h &= ~top;
	}

	return h;
}

/*
 * PJW hash of the first `length` bytes of `str`.
 *
 * The shift amounts and the high-bit mask are derived from the width of
 * unsigned int: one eighth of the width per byte, a mask over the top eighth,
 * and a fold-back shift of three quarters of the width. Returns 0 when
 * length is 0.
 */
unsigned int PJWHash(const char* str, unsigned int length)
{
	const unsigned int wordBits = (unsigned int)(sizeof(unsigned int) * 8);
	const unsigned int foldShift = (unsigned int)((wordBits * 3) / 4);
	const unsigned int stepShift = (unsigned int)(wordBits / 8);
	const unsigned int topMask =
		(unsigned int)(0xFFFFFFFF) << (wordBits - stepShift);
	unsigned int h = 0;
	unsigned int idx;

	for (idx = 0; idx < length; idx++)
	{
		unsigned int spill;

		h = (h << stepShift) + str[idx];
		spill = h & topMask;

		if (spill != 0)
		{
			/* fold the spilled high bits back in, then clear them */
			h = ((h ^ (spill >> foldShift)) & (~topMask));
		}
	}

	return h;
}



/*
 * Allocate a hash table whose buckets are all empty and store it in *table.
 *
 * table: out-parameter that receives the new table; release it with
 *        freeHashTable().
 *
 * Allocation failure is fatal: a message is printed and the process exits
 * with status 1, the same way this file handles a file that cannot be opened.
 * (Previously the malloc() result was dereferenced without being checked.)
 */
void initHashTable(HashTable ** table) {
	int i;
	HashTable * created = (HashTable *)malloc(sizeof(HashTable));

	if (created == NULL) {
		printf("内存分配失败\n");
		exit(1);
	}
	for (i = 0; i < HASHTABLE_LEN; i++) {
		created->firstNode[i] = NULL;
	}
	*table = created;
}

/*
 * Release a hash table: every node of every bucket chain, then the table
 * itself.
 *
 * table: table to destroy; it must not be used afterwards.
 */
void freeHashTable(HashTable * table) {
	int bucket;

	for (bucket = 0; bucket < HASHTABLE_LEN; ++bucket) {
		DataNode * cur = table->firstNode[bucket];

		while (cur != NULL) {
			/* read the link before the node is released */
			DataNode * following = cur->next;
			free(cur);
			cur = following;
		}
	}
	free(table);
}

/*
 * Insert a copy of a word at the head of its bucket chain.
 *
 * table: hash table to update.
 * str:   NUL-terminated word to store.
 *
 * A word that does not fit in DataNode.world (STR_LEN bytes including the
 * terminator) is reported and skipped; previously it was copied with an
 * unbounded strcpy() and overran the node. Allocation failure is fatal
 * (message, then exit(1)); previously the malloc() result was used unchecked.
 */
void insertToHashTable(HashTable * table, char * str) {
	size_t length = strlen(str);
	unsigned int index;
	DataNode * node;

	if (length >= STR_LEN) {
		printf("词条过长，已跳过\n");
		return;
	}

	node = (DataNode *)malloc(sizeof(DataNode));
	if (node == NULL) {
		printf("内存分配失败\n");
		exit(1);
	}

	index = OAAT(str, (unsigned int)length) % HASHTABLE_LEN;
	memcpy(node->world, str, length + 1);	/* +1 copies the terminating NUL */
	node->next = table->firstNode[index];
	table->firstNode[index] = node;
}

/*
 * Load a dictionary file into the hash table and the Bloom filter.
 *
 * path:  file containing whitespace-separated words.
 * table: hash table that receives one node per word (inserted at the head of
 *        the word's bucket chain).
 * BF:    Bloom filter that receives every stored word.
 *
 * Fatal errors (file cannot be opened, out of memory) print a message and
 * exit with status 1.
 *
 * Words are read with a field width of STR_LEN - 1 so that a long token can
 * no longer overrun DataNode.world (the previous unbounded "%s" could). A
 * token that does not fit is discarded as a whole instead of being stored as
 * truncated pieces; the number of discarded tokens is reported once at the end.
 */
void initDictionary(const char * path, HashTable * table, BloomFilter * BF) {
	char format[32];
	unsigned int index;
	unsigned long skipped = 0;
	int strLength, next;
	DataNode * node = NULL;
	FILE * fp = NULL;

	if ((fp = fopen(path, "r")) == NULL) {
		printf("文件打开失败\n");
		exit(1);
	}

	/* Builds "%<STR_LEN-1>s", e.g. "%99s". */
	snprintf(format, sizeof format, "%%%ds", STR_LEN - 1);

	for (;;) {
		/* A node is only allocated when the previous one was linked in. */
		if (node == NULL) {
			node = (DataNode *)malloc(sizeof(DataNode));
			if (node == NULL) {
				printf("内存分配失败\n");
				fclose(fp);
				exit(1);
			}
		}

		if (fscanf(fp, format, node->world) != 1) {
			break;
		}
		strLength = (int)strlen(node->world);

		if (strLength == STR_LEN - 1) {
			/* The buffer is full: if the token continues, it is too long. */
			next = fgetc(fp);
			if (next != EOF) {
				ungetc(next, fp);
			}
			if (next != EOF && strchr(" \t\n\v\f\r", next) == NULL) {
				fscanf(fp, "%*s");	/* drop the rest of the token */
				++skipped;
				continue;			/* reuse the node for the next word */
			}
		}

		index = OAAT(node->world, strLength) % HASHTABLE_LEN;	/* bucket index */
		node->next = table->firstNode[index];
		table->firstNode[index] = node;
		insertBF(BF, node->world, strLength);
		node = NULL;	/* ownership moved to the table */
	}
	free(node);		/* the spare node that was never filled */

	if (skipped > 0) {
		printf("词典中有 %lu 个过长的词条被跳过\n", skipped);
	}

	fclose(fp);
}

/*
 * Look a word up in the hash table.
 *
 * table: hash table to search.
 * str:   NUL-terminated word to look for.
 * len:   number of bytes of str that are hashed to pick the bucket.
 *
 * Returns 1 when a node of the selected bucket compares equal to str with
 * strcmp(), 0 otherwise.
 */
int existWorldInHashTable(HashTable * table, char * str, int len) {
	int bucket = OAAT(str, len) % HASHTABLE_LEN;
	DataNode * cur;

	for (cur = table->firstNode[bucket]; cur != NULL; cur = cur->next) {
		if (strcmp(cur->world, str) == 0) {
			return 1;
		}
	}
	return 0;
}

/*
 * Look up every word of a file and write one "<word> yes|no" line per word.
 *
 * table:      hash table holding the dictionary.
 * srcPath:    file containing whitespace-separated words to look up.
 * resultPath: file to create; each line ends with CR LF.
 * BF:         Bloom filter consulted before the hash table.
 *
 * Side effects: totalCount is incremented for every word, errorCount for
 * every word that passes the Bloom filter but is not in the hash table.
 * Fatal errors (a file cannot be opened, the result cannot be written) print
 * a message and exit with status 1.
 *
 * Changes from the previous version:
 *  - words are read with a field width of STR_LEN - 1, so a long token can no
 *    longer overrun the local buffer; such a token is copied to the output in
 *    full and reported as "no", since the buffer cannot hold it for a lookup;
 *  - the result file is opened in binary mode, so the explicit "\r\n" is
 *    written as-is instead of being expanded to "\r\r\n" by text-mode
 *    newline translation on Windows;
 *  - the input file is closed when the result file cannot be opened, and a
 *    failed write is reported instead of being ignored.
 */
void writeResult(HashTable * table, const char * srcPath, const char * resultPath, BloomFilter * BF) {
	FILE * fp, *op = NULL;
	char str[STR_LEN];
	char format[32];
	const char * verdict;
	int strLength, ch, overlong, failed;

	if ((fp = fopen(srcPath, "r")) == NULL) {
		printf("文件打开失败\n");
		exit(1);
	}
	if ((op = fopen(resultPath, "wb")) == NULL) {
		printf("文件打开失败\n");
		fclose(fp);
		exit(1);
	}

	/* Builds "%<STR_LEN-1>s", e.g. "%99s". */
	snprintf(format, sizeof format, "%%%ds", STR_LEN - 1);

	while (fscanf(fp, format, str) == 1) {
		strLength = (int)strlen(str);
		++totalCount;
		fputs(str, op);

		/* The buffer is full: stream the rest of the token, if any, to the output. */
		overlong = 0;
		if (strLength == STR_LEN - 1) {
			while ((ch = fgetc(fp)) != EOF && strchr(" \t\n\v\f\r", ch) == NULL) {
				fputc(ch, op);
				overlong = 1;
			}
		}

		if (overlong || !contains(BF, str, strLength)) {		/* Bloom filter first */
			verdict = "no";
		}
		else if (existWorldInHashTable(table, str, strLength)) {	/* then the hash table */
			verdict = "yes";
		}
		else {
			++errorCount;	/* passed the filter but is not in the dictionary */
			verdict = "no";
		}
		fprintf(op, " %s\r\n", verdict);
	}

	fclose(fp);

	/* fclose() flushes, so a write error can also surface there. */
	failed = ferror(op);
	if (fclose(op) != 0) {
		failed = 1;
	}
	if (failed) {
		printf("文件写入失败\n");
		exit(1);
	}
}



/*
 * Program entry point: build the dictionary, look up every word of the data
 * file, write the results and print the measured Bloom filter error ratio.
 *
 * Optional arguments override the built-in paths:
 *   argv[1] dictionary file (default DICTIONARY_PATH)
 *   argv[2] file with the words to look up (default DATA_PATH)
 *   argv[3] result file (default RESULT_PATH)
 * Running without arguments behaves as before.
 *
 * The printed ratio is errorCount / totalCount, i.e. the share of all
 * looked-up words that passed the Bloom filter but were not in the hash
 * table. It is reported as 0 when no word was looked up, instead of
 * printing the NaN that 0/0 would produce.
 */
int main(int argc, char * argv[]) {
	const char * dictionaryPath = (argc > 1) ? argv[1] : DICTIONARY_PATH;
	const char * dataPath = (argc > 2) ? argv[2] : DATA_PATH;
	const char * resultPath = (argc > 3) ? argv[3] : RESULT_PATH;
	HashTable * table;
	BloomFilter * BF;
	float bf_error = 0;

	/* Bloom filter storage */
	initBF(&BF);

	/* hash functions used by the Bloom filter */
	m_pHash[0] = OAAT;
	m_pHash[1] = RSHash;
	m_pHash[2] = JSHash;
	m_pHash[3] = PJWHash;
	m_pHash[4] = ELFHash;

	printf("初始化哈希表\n");
	initHashTable(&table);

	initDictionary(dictionaryPath, table, BF);

	printf("开始扫描\n");
	writeResult(table, dataPath, resultPath, BF);

	if (totalCount > 0) {
		bf_error = (float)(errorCount / totalCount);
	}
	printf("BF假阳性误判率为：%f\n",bf_error);

	printf("程序完成\n");
	freeHashTable(table);
	free(BF);

	return 0;
}

结论

本文简要介绍了词典快速匹配查找方案，主要介绍了使用哈希表的查找方法，并使用C语言对其进行了实现。

丶丶路遥

关注

0
点赞
踩
3

收藏

觉得还不错? 一键收藏
0
评论
词典快速匹配方案

介绍对于构造了几百万上千万的集合中，快速查找对应的元素是一种常见的应用场景，因此本文将探讨一种如何在一个大集合中快速查找对应元素的方法。查找方案哈希查找和字典树是两种高性能的查找方法。其中字典树使用最长前缀匹配法，其查找速度非常快，但由于更新时需要调整整个树，导致其更新速度较慢，因此字典树不适用于动态集合。而哈希查找解决了更新速度慢的问题，因此本文主要介绍基于哈希查找的词典匹配方法。本文将...
复制链接

扫一扫