C++ 数据结构算法学习笔记(24) - 哈希表的企业级应用案例

最新推荐文章于 2024-07-07 23:47:58 发布

俊诚_CCS

最新推荐文章于 2024-07-07 23:47:58 发布

阅读量710

点赞数 7

分类专栏： C++ 学习笔记文章标签：数据结构算法 c++

本文链接：https://blog.csdn.net/2201_75704074/article/details/138766627

版权

C++ 学习笔记专栏收录该内容

35 篇文章 10 订阅

订阅专栏

C++ 数据结构算法学习笔记(24) - 哈希表的企业级应用案例

淘宝分布式文件系统

根据淘宝 2016 年的数据分析，淘宝卖家已经达到 900 多万，有上十亿的商品。每一个商品都包含大量的图片和文字(平均：15k)，粗略估计下，数据所占的存储空间在 1PB 以上，如果使用单块容量为 1T 的磁盘来保存数据，那么至少也需要 1024 块磁盘(1PB = 1024TB)来保存.

在这里插入图片描述

思考？这么大的数据量，应该怎么保存呢？就保存在普通的单个文件中或单台服务器中吗？显然是不可行的。淘宝针对海量非结构化数据存储设计出了一款分布式系统,叫 TFS,它构筑在普通的 Linux 机器集群上,可为外部提供高可靠和高并发的存储访问.

设计思路

以 block 文件的形式存放数据文件(一般 64M 一个 block),以下简称为“块”，每个块都有唯一的一个整数编号，块在使用之前所用到的存储空间都会预先分配和初始化。每一个块由一个索引文件、一个主块文件和若干个扩展块组成，“小文件”主要存放在主块中，扩展块主要用来存放溢出的数据。每个索引文件存放对应的块信息和“小文件”索引信息，索引文件会在服务启动时映射（mmap）到内存，以便极大地提高文件检索速度。“小文件”索引信息采用索引文件中的哈希链表数据结构来实现。每个文件有对应的文件编号，文件编号从 1 开始编号，依次递增，同时作为哈希查找算法的 Key 来定位“小文件”在主块和扩展块中的偏移量。文件编号+块编号按某种算法可得到“小文件”对应的文件名。

在这里插入图片描述

哈希链表实现

键(key)：文件的编号如， 1 、 5 、 19 。。。

值(value)：文件的索引信息（包含文件大小、位置）

索引：数组的下标(0,1,2,3,4) ，用以快速定位和检索数据

哈希桶：保存索引的数组，数组成员为索引值相同的多个元素（以链表的形式链接）

哈希函数: 将文件编号映射到索引上，采用求余法，如：哈希桶共 5 个(索引 0~4)时，文件编号 1 对 5 求余得 1，落在索引 1 上；文件编号 19 对 5 求余得 4，落在索引 4 上

在这里插入图片描述

实现项目 - DNA 检测字符串匹配

随着基因检测技术的成熟，科学家们可以通过基因相似度来进行检测。现在要对 N 个人进行基因测试，通过基因检测判断其是否为色盲。

在这里插入图片描述

测试色盲的基因组包含 8 位基因，编号 1 至 8。每一位基因都可以用一个字符来表示，这个字符是 'A'、'B'、'C'、'D' 四个字符之一。

如：ABDBCBAD

通过认真观察研究，生物学家发现，有时候可能通过特定的连续几位基因，就能区分开是正常者还是色盲者。对于色盲基因，不需要 8 位基因，只需要看其中连续的 4 位基因就可以判定是正常者还是色盲者，

这 4 位基因编号分别是：（第 2、3、4、5）。也就是说，只需要把第 2,3,4,5 这四位连续的基因与色盲基因库的记录对比，就能判定该人是正常者还是色盲者。假设给定的色盲基因库如下：

ADBB

BDDC

CDBC

BDBB …

请测试下列的基因是否为色盲

AADBBBAD

ABDDCBAA

CCDBCBAA

ABDBBBAC

ABDBCBAD

ABDDBBAD

解答思路

可以直接把待测试基因的 2,3,4,5 位直接与基因库里的记录逐一对比，但如果色盲基因库很庞大，程序执行效率很低
可以使用哈希表来存储色盲基因库数据，通过哈希函数把 4 位色盲基因映射到哈希表中，大大提高检索的效率.

实现代码:

DNA_HashTable.h

#pragma once
// Fallback bucket count: Init_Hash uses it when asked for a non-positive size.
#define MAX_SIZE	128

// One node of a bucket's singly linked chain.
// The node stores the caller's `data` and `key` pointers as given; the
// pointed-to objects are not copied by the table code.
typedef struct _LinkNode
{
	// Next node in the same bucket, or NULL at the end of the chain.
	struct _LinkNode* next;
	// Payload pointer handed back by Retrieve().
	const void* data;
	// Lookup key; Find() compares it as a NUL-terminated C string.
	const void* key;
}LinkNode;

// Both aliases name the same type: a pointer to a chain node.
// Link_Node is used for nodes being walked or created, Element for bucket
// slots and lookup results.
typedef _LinkNode* Link_Node;
typedef _LinkNode* Element;

// Separate-chaining hash table.
typedef struct _Hash_Table
{
	// Number of buckets in TheLists.
	int size;
	// Array of `size` head-node pointers. Each head is a zero-filled
	// placeholder node created by Init_Hash; the real entries of a bucket
	// hang off head->next.
	Element* TheLists;
}Hash_Table;

DNA_HashTable.cpp

#include "DNA_HashTable.h"
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <new>
#include <string>

// Number of buckets main() requests for the demo table.
#define BUCKET_SIZE 1024
// Compare two keys as NUL-terminated C strings (strcmp semantics: 0 means equal).
// Each argument is parenthesized so that an expression argument such as
// `cond ? a : b` is cast as a whole instead of only its first operand.
#define compare(a,b) strcmp((const char*)(a), (const char*)(b))

using namespace std;

// Create a hash table with `size` buckets.
//
// Every bucket gets its own zero-initialised head node; real entries are later
// linked behind that head.
//
// size:    requested bucket count; a non-positive value falls back to MAX_SIZE.
// returns: the new table, or NULL when any allocation fails. On failure
//          everything allocated so far is released.
Hash_Table* Init_Hash(int size)
{
	if (size <= 0)
	{
		size = MAX_SIZE;
	}

	// Plain `new` throws on failure and never yields NULL, which would make
	// the checks below dead code; the nothrow form returns NULL instead.
	Hash_Table* new_table = new (std::nothrow) Hash_Table;
	if (!new_table)
	{
		cout << "The Hash table is not able to init due to unknown reason on allocate memory for 'new_table'";
		return NULL;
	}

	// Value-initialise so every slot starts as NULL.
	Element* new_element = new (std::nothrow) Element[size]();
	if (new_element == NULL)
	{
		cout << "The Hash table is not able to init due to the unknown reason on allocate memory for 'new_elementl' ";
		delete new_table;
		return NULL;
	}

	new_table->size = size;
	new_table->TheLists = new_element;

	for (int i = 0; i < size; i++)
	{
		// `LinkNode()` zero-fills next/data/key, replacing the former memset.
		Link_Node new_node = new (std::nothrow) LinkNode();
		if (!new_node)
		{
			cout << "The Hash table is not able to init due to the unknown reason on allocate memory for 'new_node' ";
			// Release the head nodes created by earlier iterations, then the
			// array itself (allocated with new[], so delete[] is required).
			for (int j = 0; j < i; j++)
			{
				delete new_element[j];
			}
			delete[] new_element;
			delete new_table;
			return NULL;
		}
		new_table->TheLists[i] = new_node;
	}
	return new_table;
}

// Hash a NUL-terminated string key with the recurrence
// hash = 65599 * hash + c, written with shifts.
//
// key:     pointer to a NUL-terminated character string.
// returns: the hash with the top bit cleared (0 .. 0x7FFFFFFF); 0 is also
//          returned, after logging, when key is NULL.
static unsigned int Hash_Function(const void* key)
{
	if (key == NULL)
	{
		cout << "The Hash_Function having error because the key for the element is invalid" << endl;
		return 0;
	}

	unsigned int result = 0;
	for (const char* cursor = (const char*)key; *cursor != '\0'; ++cursor)
	{
		// (h << 6) + (h << 16) - h == h * 65599
		result = (*cursor) + (result << 6) + (result << 16) - result;
	}

	// Mask off the sign bit so the value is non-negative as an int.
	return result & 0x7FFFFFFF;
}

// Map a string key to a bucket index.
//
// key:       NUL-terminated string key.
// TableSize: number of buckets; must be positive.
// returns:   Hash_Function(key) reduced modulo TableSize, or -1 (after
//            logging) when key is NULL or TableSize is not positive.
int Hash(const void* key, int TableSize)
{
	if (key == NULL)
	{
		cout << "The key is invalid in Hash Function" << endl;
		return -1;
	}
	if (TableSize <= 0)
	{
		cout << "The table size is invalid!" << endl;
		return -1;
	}

	unsigned int raw = Hash_Function(key);
	return raw % TableSize;
}

// Look up the node whose key equals `key` (compared as C strings).
//
// table:   table to search.
// key:     NUL-terminated string key.
// returns: the matching node, or NULL when the key is absent, when table/key
//          is NULL, or when no valid bucket index can be computed.
LinkNode* Find(Hash_Table* table,const void* key)
{
	if (!key ||!table)
	{
		cout << "There is an eror on the Find function because the key for the element is invalid" << endl;
		// Bail out: continuing would dereference the NULL table/key.
		return NULL;
	}

	int value = Hash(key,table->size);
	if (value < 0)
	{
		// Hash() already logged the reason (e.g. a non-positive table size);
		// -1 must never be used as an array index.
		return NULL;
	}

	// Skip the placeholder head node; entries start at head->next.
	Element tmp = table->TheLists[value]->next;
	while (tmp && compare(tmp->key, key) != 0)
	{
		tmp = tmp->next;
	}

	return tmp;
}

// Insert a (key, data) pair at the front of the key's bucket.
//
// Only the two pointers are stored; the caller must keep the pointed-to key
// and data alive for as long as the entry stays in the table.
//
// table:   destination table.
// data:    payload pointer to associate with the key.
// key:     NUL-terminated string key.
// returns: true when a new entry was added; false when table/key is invalid,
//          the key is already present, or the node cannot be allocated.
bool Insert_Hash(Hash_Table* table, const void* data, const void* key)
{
	if (!table)
	{
		cout << "The Insert Hash function return error because the table is not initiallized" << endl;
		return false;
	}
	if (!key)
	{
		cout << "The Insert Hash function return error because the key is invalid" << endl;
		return false;
	}

	// Resolve the bucket first so an invalid index is rejected before any
	// lookup or allocation happens.
	int value = Hash(key,table->size);
	if (value < 0)
	{
		return false;
	}

	if (Find(table, key))
	{
		cout << "The key is already inside the hash Table" << endl;
		return false;
	}

	// nothrow so that an allocation failure is reported through the return
	// value instead of an exception.
	Element new_node = new (std::nothrow) LinkNode;
	if (!new_node)
	{
		cout << "Error while allocate memory to the new_node" << endl;
		return false;
	}
	new_node->data = data;
	new_node->key = key;

	// Link the new node directly behind the bucket's head node.
	new_node->next = table->TheLists[value]->next;
	table->TheLists[value]->next = new_node;
	return true;
}

// Remove the entry whose key equals `key` (compared as C strings, the same
// rule Find and Insert_Hash use).
//
// Only the node is freed; the key and data it pointed to are left untouched.
//
// table:   table to modify.
// key:     NUL-terminated string key.
// returns: true when an entry was removed, false when table/key is invalid or
//          the key is not present.
bool Delete(Hash_Table* table, const void* key)
{
	if (!table || !key)
	{
		cout << "The element inside the Hash_Table is not able to delete" << endl;
		return false;
	}

	int value = Hash(key,table->size);
	if (value < 0)
	{
		cout << "The element inside the Hash_Table is not able to delete" << endl;
		return false;
	}

	// Start `last` at the bucket's head node so that unlinking the first real
	// entry also has a valid predecessor.
	Link_Node last = table->TheLists[value];
	Link_Node tmp = last->next;
	while (tmp && compare(tmp->key, key) != 0)
	{
		last = tmp;
		tmp = tmp->next;
	}

	if (!tmp)
	{
		cout << "The element is not able to found inside the hash table" << endl;
		return false;
	}

	last->next = tmp->next;
	delete tmp;
	return true;
}

// Return the payload stored in a node obtained from Find().
//
// e:       node to read; may be NULL.
// returns: e->data, or NULL (after logging) when e is NULL.
const void* Retrieve(Element e)
{
	if (e != NULL)
	{
		return e->data;
	}

	cout << "The element is not able to find inside the hash table " << endl;
	return NULL;
}

void Destroy(Hash_Table* table)
{
	if (!table)
	{
		cout << "There is an error in destroy function because the input table is invalid" << endl;
		return;
	}
	LinkNode* tmp = NULL;
	LinkNode* next = NULL;

	for (int i = 0; i < table->size; i++)
	{
		tmp = table->TheLists[i];
		next = table->TheLists[i]->next;
		while (tmp != NULL)
		{
			delete tmp;
			tmp = next;
			tmp = tmp->next;
		}
	}
	delete (table->TheLists);
	delete table;
}

int main()
{
	const char* elems[] = { "ADBB","BDDC","CDBC","BDBB" };
	const char* tester = "ABDDCBAC";
	char cur[5] = { '\0' };
	int i = 0;
	Hash_Table* HashTable = NULL;
	HashTable = Init_Hash(BUCKET_SIZE);
	Insert_Hash(HashTable, elems[0], elems[0]);
	Insert_Hash(HashTable, elems[1], elems[1]);
	Insert_Hash(HashTable, elems[2], elems[2]);
	Insert_Hash(HashTable, elems[3], elems[3]);
	//Delete(HashTable, elems[0]);
	strncpy_s(cur, tester + 1, 4);//ADBB'\0'
	Element e = Find(HashTable, cur);
	if (e) {
		printf("%s\n", (const char*)Retrieve(e));
	}
	else {
		printf("Not found [key:%s]\n", cur);
	}
	system("pause");
	return 0;
}

俊诚_CCS

关注

7
点赞
踩
21

收藏

觉得还不错? 一键收藏
0
评论
C++ 数据结构算法学习笔记(24) - 哈希表的企业级应用案例

根据淘宝 2016 年的数据分析，淘宝卖家已经达到 900 多万，有上十亿的商品。每一个商品都包含大量的图片和文字(平均：15k)，粗略估计下，数据所占的存储空间在 1PB 以上，如果使用单块容量为 1T 的磁盘来保存数据，那么至少也需要 1024 块磁盘(1PB = 1024TB)来保存. 思考？这么大的数据量，应该怎么保存呢？就保存在普通的单个文件中或单台服务器中吗？显然是不可行的。
复制链接

扫一扫