哈希Hash

又是这货

已于 2022-05-24 20:23:46 修改

阅读量370

点赞数 1

分类专栏： c++ 文章标签：哈希算法算法散列表

于 2022-02-26 21:22:17 首次发布

本文链接：https://blog.csdn.net/qq_55956945/article/details/123149306

版权

c++ 专栏收录该内容

9 篇文章 0 订阅

订阅专栏

文章目录

unordered_set
闭散列线性探测
开散列哈希桶
unordered_set.h
unordered_map.h
BitSet.h位图
布隆过滤器BloomFilter.h
海量数据

unordered_set

#include<iostream>
#include<vector>
#include<unordered_set>
#include<set>
#include<time.h>

using namespace std;

// Inserts the same n pseudo-random integers into a std::set and then into a
// std::unordered_set, printing the elapsed time of each run.
// The printed values are clock() ticks: clock() reports processor time in
// units of 1/CLOCKS_PER_SEC seconds, so they are milliseconds only on
// platforms where CLOCKS_PER_SEC == 1000.
void test_time()
{
	const int n = 1000000;
	std::vector<int> v;
	v.reserve(n); // one allocation up front; keeps vector growth out of the setup phase
	srand(static_cast<unsigned int>(time(0))); // seed the pseudo-random generator
	for (int i = 0; i < n; ++i)
	{
		v.push_back(rand());
	}

	std::set<int> s;
	const clock_t begin1 = clock(); // clock() returns clock_t, not size_t
	for (auto e : v)
	{
		s.insert(e);
	}
	const clock_t end1 = clock();

	std::cout << "set : " << end1 - begin1 << std::endl;


	std::unordered_set<int> us;
	const clock_t begin2 = clock();
	for (auto e : v)
	{
		us.insert(e);
	}
	const clock_t end2 = clock();

	std::cout << "unordered_set : " << end2 - begin2 << std::endl;
}

// Entry point of the timing demo: runs the set / unordered_set comparison once.
int main()
{
	test_time();
	return 0;
}

set : 9065
unordered_set : 3835

STL里的算法在实际中少使用

闭散列线性探测

set-> HashTable<K, K>
map<K, V>-> HashTable<K, pair<K, V>>
HashTable.h:

namespace Close
{
	// State of one slot in the closed-hashing (open addressing) table.
	enum State
	{
		EMPTY,  // never used; a probe sequence stops here
		EXIST,  // holds a live value
		DELETE  // tombstone: the value was logically removed, but lookups must
		        // keep probing past this slot so later entries stay reachable
	};

	// One table slot: a value plus its state.
	template<class T>
	struct HashNode
	{
		State _state = EMPTY;
		T _t;
	};

	// Default hash functor: for integral keys the key itself is the hash.
	template<class K>
	struct Hash
	{
		size_t operator()(const K& key) const
		{
			return key;
		}
	};

	// Specialization for strings: polynomial hash with multiplier 131
	// (same formula as the Open namespace version of this functor).
	template<>
	struct Hash<std::string>
	{
		size_t operator()(const std::string& s) const
		{
			size_t hash = 0;
			for (auto ch : s)
			{
				hash = hash * 131 + ch;
			}
			return hash;
		}
	};

	// Hash table using linear probing.
	// K        - key type accepted by Find/Erase.
	// T        - stored value type; this implementation hashes and compares
	//            the stored value directly, so T must be usable where a K is
	//            expected (both uses in this file have T == K).
	// HashFunc - functor mapping a key to size_t.
	template<class K, class T, class HashFunc = Hash<K>>
	class HashTable
	{
	public:
		// Inserts t. Returns false (and changes nothing) if an equal value is
		// already present, true otherwise.
		bool Insert(const T& t)
		{
			// Duplicate check first, so a rejected insert never triggers a rehash.
			if (Find(t))
				return false;

			// Grow when the table is empty or the load factor reaches 0.7.
			// ">=" rather than "==" so the check cannot be stepped over.
			if (_tables.empty() || _size * 10 / _tables.size() >= 7)
			{
				const size_t newsize = _tables.empty() ? 10 : _tables.size() * 2;

				// Positions depend on the table size, so every live value is
				// re-inserted into a fresh table instead of being copied
				// slot-for-slot. Re-using Insert re-uses the probing logic; the
				// new table is large enough that it will not grow again.
				HashTable<K, T, HashFunc> newht;
				newht._tables.resize(newsize);
				for (auto& e : _tables)
				{
					if (e._state == EXIST)
					{
						newht.Insert(e._t);
					}
				}
				_tables.swap(newht._tables);
				_size = newht._size; // tombstones were dropped by the rehash
			}

			HashFunc hf;
			const size_t start = hf(t) % _tables.size();
			size_t index = start;
			// Linear probing: the first slot that is not EXIST (EMPTY or
			// DELETE) can take the value. The load-factor check above
			// guarantees such a slot exists.
			for (size_t i = 1; _tables[index]._state == EXIST; ++i)
			{
				index = (start + i) % _tables.size();
			}

			// Re-using a tombstone does not consume a new slot.
			if (_tables[index]._state == EMPTY)
				++_size;
			_tables[index]._t = t;
			_tables[index]._state = EXIST;

			return true;
		}

		// Returns a pointer to the slot holding key, or nullptr if absent.
		// Safe to call on an empty table.
		HashNode<T>* Find(const K& key)
		{
			if (_tables.empty())
				return nullptr; // avoids "% 0" below

			HashFunc hf;
			const size_t n = _tables.size();
			const size_t start = hf(key) % n;
			// At most n probes, so the loop terminates even if no slot is EMPTY.
			for (size_t i = 0; i < n; ++i)
			{
				HashNode<T>& node = _tables[(start + i) % n];
				if (node._state == EMPTY)
					break;
				// A DELETE slot may still hold the old value, so check the state.
				if (node._state == EXIST && node._t == key)
					return &node;
			}
			return nullptr;
		}

		// Removes key by marking its slot as a tombstone.
		// Returns true if the key was present.
		bool Erase(const K& key)
		{
			HashNode<T>* ret = Find(key);
			if (ret == nullptr)
				return false;

			// Pseudo-delete: only the state changes. _size is left alone on
			// purpose, because the slot still occupies the probe sequence.
			ret->_state = DELETE;
			return true;
		}
	private:
		std::vector<HashNode<T>> _tables; // the slots; copying/destruction is handled by vector
		size_t _size = 0; // number of non-EMPTY slots (live values + tombstones), used for the load factor
	};
}

字符串哈希算法

void TestHashTable()
	{
		HashTable<int, int> ht;//本身是int时，不需要传第3个参数
		ht.Insert(5);
		ht.Insert(15);
		ht.Insert(16);
		ht.Insert(17);
		ht.Insert(25);
		ht.Insert(35);
		ht.Insert(45);
		ht.Insert(55);

		struct StrHash//仿函数
		{
			size_t operator()(const string& s)
			{
				size_t hash = 0;
				for (auto ch : s)
				{
					hash += ch;
				}
				return hash;
			}
		};

HashTable<string, string, StrHash> strht;//整形是有限的数，字符串是无限的，方法1：仿函数
HashTable<string, string> strht;//方法2：模板特化，有特化就用特化
		strht.Insert("sort");
		strht.Insert("insert");
#include "HashTable.h"
#include "unordered_map.h"
#include "unordered_set.h"
// Driver: runs the unordered_set demo; the other demos are left disabled.
int main()
{
	// testop();
	// Close::TestHashTable();
	bit::test_unordered_set();

	return 0;
}

开散列哈希桶

HashTable.h：

namespace Open
{
	// Returns the smallest prime in the growth table that is strictly greater
	// than `prime`. If no entry is larger, the largest entry is returned.
	// Declared inline because this definition lives in a header.
	inline size_t GetNextPrime(size_t prime)
	{
		static const int PRIMECOUNT = 28;
		static const size_t primeList[PRIMECOUNT] =
		{
			53ul, 97ul, 193ul, 389ul, 769ul,
			1543ul, 3079ul, 6151ul, 12289ul, 24593ul,
			49157ul, 98317ul, 196613ul, 393241ul, 786433ul,
			1572869ul, 3145739ul, 6291469ul, 12582917ul, 25165843ul,
			50331653ul, 100663319ul, 201326611ul, 402653189ul, 805306457ul,
			1610612741ul, 3221225473ul, 4294967291ul
		};

		for (int i = 0; i < PRIMECOUNT; ++i)
		{
			if (primeList[i] > prime)
				return primeList[i];
		}
		// Table exhausted: stay at the last prime instead of reading past the array.
		return primeList[PRIMECOUNT - 1];
	}

	// Singly linked bucket node.
	template<class T>
	struct HashLinkNode
	{
		T _t;
		HashLinkNode<T>* _next;

		HashLinkNode(const T& t)
			: _t(t)
			, _next(nullptr)
		{}
	};

	// Default hash functor: for integral keys the key itself is the hash.
	template<class K>
	struct Hash
	{
		size_t operator()(const K& key) const
		{
			return key;
		}
	};

	// Specialization for strings: polynomial hash with multiplier 131.
	template<>
	struct Hash<std::string>
	{
		size_t operator()(const std::string& s) const
		{
			size_t hash = 0;
			for (auto ch : s)
			{
				hash = hash * 131 + ch;
			}

			return hash;
		}
	};

	// Forward declaration: the iterator stores a pointer to its table, and the
	// table's definition comes after the iterator.
	template<class K, class T, class KeyOfT, class hash>
	class HashTable;

	// Forward iterator over all values of a HashTable.
	// It walks the current bucket's list; when that list ends it moves to the
	// next non-empty bucket, which is why it keeps a pointer to the table.
	template<class K, class T, class Ref, class Ptr, class KeyOfT, class hash>
	struct HashIterator
	{
		typedef HashIterator<K, T, Ref, Ptr, KeyOfT, hash> Self;
		typedef HashLinkNode<T> Node;
		Node* _node;                          // current node; nullptr means end
		HashTable<K, T, KeyOfT, hash>* _pht;  // owning table, used to find the next bucket

		HashIterator(Node* node, HashTable<K, T, KeyOfT, hash>* pht)
			:_node(node)
			, _pht(pht)
		{}

		Ref operator*() const
		{
			return _node->_t;
		}

		Ptr operator->() const
		{
			return &(_node->_t);
		}

		bool operator!=(const Self& s) const
		{
			return _node != s._node;
		}

		bool operator==(const Self& s) const
		{
			return _node == s._node;
		}

		// Pre-increment. Must not be called on the end iterator.
		// Hash tables are traversed forward only, so there is no operator--.
		Self& operator++()
		{
			if (_node->_next)
			{
				// More nodes in the current bucket.
				_node = _node->_next;
				return *this;
			}

			// Current bucket finished: recompute its index from the current
			// value and scan forward for the next non-empty bucket.
			KeyOfT kot;
			const size_t buckets = _pht->_tables.size();
			size_t index = _pht->HashFunc(kot(_node->_t), buckets) + 1;

			_node = nullptr; // stays nullptr (== end) if no later bucket has nodes
			for (; index < buckets; ++index)
			{
				if (_pht->_tables[index])
				{
					_node = _pht->_tables[index];
					break;
				}
			}
			return *this;
		}
	};


	// Separate-chaining hash table (one singly linked list per bucket).
	// K      - key type.
	// T      - stored value type (K for a set, pair<const K, V> for a map).
	// KeyOfT - functor extracting the key from a T.
	// hash   - functor mapping a key to size_t.
	template<class K, class T, class KeyOfT, class hash = Hash<K>>
	class HashTable
	{
		typedef HashLinkNode<T> Node;
		// Both iterator flavours read _tables directly.
		friend struct HashIterator<K, T, T&, T*, KeyOfT, hash>;
		friend struct HashIterator<K, T, const T&, const T*, KeyOfT, hash>;
	public:
		typedef HashIterator<K, T, T&, T*, KeyOfT, hash> Iterator;
		typedef HashIterator<K, T, const T&, const T*, KeyOfT, hash> Const_Iterator;

		HashTable() = default;

		// Deep copy: every node is duplicated, bucket order is preserved.
		HashTable(const HashTable& other)
			: _tables(other._tables.size())
			, _size(0)
		{
			try
			{
				for (size_t i = 0; i < other._tables.size(); ++i)
				{
					Node** tail = &_tables[i];
					for (Node* cur = other._tables[i]; cur; cur = cur->_next)
					{
						*tail = new Node(cur->_t);
						tail = &(*tail)->_next;
						++_size;
					}
				}
			}
			catch (...)
			{
				Clear(); // the destructor will not run for a half-built object
				throw;
			}
		}

		// Copy-and-swap assignment (the parameter is taken by value).
		HashTable& operator=(HashTable other)
		{
			_tables.swap(other._tables);
			const size_t tmp = _size;
			_size = other._size;
			other._size = tmp;
			return *this;
		}

		// The table owns its nodes, so they are released here.
		~HashTable()
		{
			Clear();
		}

		// Iterator to the first node of the first non-empty bucket, or End().
		Iterator Begin()
		{
			for (size_t i = 0; i < _tables.size(); ++i)
			{
				if (_tables[i])
				{
					return Iterator(_tables[i], this);
				}
			}
			return End();
		}

		Iterator End()
		{
			return Iterator(nullptr, this);
		}

		// Maps key to a bucket index in a table of n buckets. n must be non-zero.
		size_t HashFunc(const K& key, size_t n)
		{
			hash hf;
			size_t ki = hf(key);
			return ki % n;
		}

		// Inserts t unless a value with the same key already exists.
		// Returns (iterator to the value with that key, whether it was inserted).
		std::pair<Iterator, bool> Insert(const T& t)
		{
			KeyOfT kot;

			// Keys are unique: report the existing node if there is one.
			Iterator pos = Find(kot(t));
			if (pos != End())
				return std::pair<Iterator, bool>(pos, false);

			// Grow when the load factor reaches 1 (this also covers the empty table).
			if (_size == _tables.size())
			{
				const size_t newsize = GetNextPrime(_tables.size());
				std::vector<Node*> newtables;
				newtables.resize(newsize, nullptr);

				// Relink the existing nodes into the new table instead of
				// reallocating them.
				for (size_t i = 0; i < _tables.size(); ++i)
				{
					Node* cur = _tables[i];
					while (cur)
					{
						Node* next = cur->_next; // saved before cur is relinked
						const size_t index = HashFunc(kot(cur->_t), newtables.size());
						cur->_next = newtables[index]; // push front
						newtables[index] = cur;
						cur = next;
					}
					_tables[i] = nullptr;
				}
				newtables.swap(_tables);
			}

			const size_t index = HashFunc(kot(t), _tables.size());
			Node* newnode = new Node(t);
			newnode->_next = _tables[index]; // push front of the bucket list
			_tables[index] = newnode;
			++_size; // without this the load factor never changes and the table never grows again
			return std::pair<Iterator, bool>(Iterator(newnode, this), true);
		}

		// Returns an iterator to the value with the given key, or End().
		// Safe to call on an empty table.
		Iterator Find(const K& key)
		{
			if (_tables.empty())
				return End(); // avoids "% 0" in HashFunc

			KeyOfT kot;
			const size_t index = HashFunc(key, _tables.size());
			for (Node* cur = _tables[index]; cur; cur = cur->_next)
			{
				if (kot(cur->_t) == key)
				{
					return Iterator(cur, this);
				}
			}
			return End();
		}

		// Removes the value with the given key. Returns true if one was removed.
		// (The name keeps the original spelling used by existing callers.)
		bool Earse(const K& key)
		{
			if (_tables.empty())
				return false; // avoids "% 0" in HashFunc

			KeyOfT kot;
			const size_t index = HashFunc(key, _tables.size());
			Node* prev = nullptr;
			Node* cur = _tables[index];
			while (cur)
			{
				if (kot(cur->_t) == key)
				{
					if (prev == nullptr)
					{
						_tables[index] = cur->_next; // removing the bucket head
					}
					else
					{
						prev->_next = cur->_next;
					}
					delete cur;
					--_size;
					return true;
				}
				prev = cur;
				cur = cur->_next;
			}
			return false;
		}

		// Correctly spelled alias for Earse.
		bool Erase(const K& key)
		{
			return Earse(key);
		}
	private:
		// Frees every node and leaves all buckets empty.
		void Clear()
		{
			for (size_t i = 0; i < _tables.size(); ++i)
			{
				Node* cur = _tables[i];
				while (cur)
				{
					Node* next = cur->_next;
					delete cur;
					cur = next;
				}
				_tables[i] = nullptr;
			}
			_size = 0;
		}

		std::vector<Node*> _tables; // bucket heads; nullptr for an empty bucket
		size_t _size = 0;           // number of stored values
	};
}

unordered_set.h

#pragma once
#include "HashTable.hpp"

//map和set一样，自身是没有什么东西的，各种操作是在红黑树的基础上实现的
//unordered_set和unordered_map都是在开散列HashTable基础上实现的
namespace bit
{
	template<class K,class hash = Open::Hash<K>> 
	class unordered_set
	{
		struct SetKOfT
		{
			const K& operator()(const K& k)
			{
				return k;
			}
		};
	public:
		typedef typename Open::HashTable<K, K, SetKOfT, hash>::Iterator iterator;//基于开散列Open

		iterator begin()
		{
			return _t.Begin();
		}

		iterator end()
		{
			return _t.End();
		}

		iterator find(const K& key)
		{
			return _t.Find(key);
		}

		bool earse(const K& key)
		{
			return _t.Earse(key);
		}
		pair<iterator, bool> insert(const K& k)
		{
			return _t.Insert(k);
		}

	private:
		Open::HashTable<K, K, SetKOfT, hash> _t;
	};

	void test_unordered_set()
	{
		bit::unordered_set<int> us;
		us.insert(1);
		us.insert(54);
		us.insert(58);
		us.insert(59);
		us.insert(21);
		us.insert(22); 
		us.insert(23);
		us.insert(24);
		for (auto& e : us)
		{
			cout << e << " ";
		}
		cout << endl;

		unordered_set<int>::iterator it = us.find(22);
		cout << *it << endl;

		us.earse(24);
		us.earse(21);
		us.earse(22);
		for (auto& e : us)
		{
			cout << e << " ";
		}
		cout << endl;
	}
}

unordered_map.h

#pragma once 
#include"HashTable.hpp"

namespace bit
{
	template<class K,class V,class hash = Open::Hash<K>>//基于开散列
	class unordered_map
	{
		struct MapKOfT
		{
			const K& operator()(const pair<const K, V>& kv)
			{
				return kv.first;
			}
		};

	public:
		typedef typename Open::HashTable<K, pair<const K, V>, MapKOfT, hash>::Iterator iterator;
		pair<iterator, bool> insert(const pair<const K, V>& kv)
		{
			return _ht.Insert(kv);
		}

		iterator begin()
		{
			return _ht.Begin();
		}

		iterator end()
		{
			return _ht.End();
		}

		iterator find(const K& key)
		{
			return _ht.Find(key);
		}

		bool earse(const K& key)
		{
			return _ht.Earse(key);
		}

		V& operator[](const K& key)
		{
			pair<iterator,bool> ret = insert(make_pair(key, V()));
			return ret.first->second;
		}
	private:
		Open::HashTable<K, pair<const K, V>, MapKOfT,hash> _ht;
	};

	void test_unordered_map()
	{
		bit::unordered_map<int, int> um;
		um.insert(make_pair(1, 1));
		um.insert(make_pair(2, 1));
		um.insert(make_pair(3, 1));

		unordered_map<int,int>::iterator it = um.begin();
		while (it != um.end())
		{
			cout << it->first << ":"<<it->second<<endl;
			++it;
		}

		bit::unordered_map<string,string> dict;
		dict["hash"] = "哈希";
		dict["sort"] = "排序";
	
		for (auto& e : dict)
		{
			cout << e.first << ":" << e.second << endl;
		}

		cout <<endl;

		dict.earse("hash");
		dict.earse("find");
		for (auto& e : dict)
		{
			cout << e.first << ":" << e.second << endl;
		}
	}
}

BitSet.h位图

#pragma once
#include <vector>

namespace bit
{
	// Fixed-size bitmap with N addressable bits (positions 0 .. N-1), packed
	// eight per byte. Positions >= N are out of range: set/reset ignore them
	// and test reports them as unset.
	template<size_t N>
	class bitset
	{
	public:
		bitset()
		{
			// N / 8 bytes, plus one so a partial last byte is covered; all bits start at 0.
			_bits.resize((N >> 3) + 1, 0);
		}

		// Sets bit x to 1.
		void set(size_t x)
		{
			if (x >= N)
				return;

			const size_t index = x >> 3; // which byte
			const size_t num = x % 8;    // which bit inside that byte

			_bits[index] |= (1 << num);
		}

		// Clears bit x to 0.
		void reset(size_t x)
		{
			if (x >= N)
				return;

			const size_t index = x >> 3;
			const size_t num = x % 8;

			_bits[index] &= (~(1 << num));
		}

		// Returns true if bit x is set.
		bool test(size_t x) const
		{
			if (x >= N)
				return false;

			const size_t index = x >> 3;
			const size_t num = x % 8;

			return (_bits[index] & (1 << num)) != 0;
		}
	private:
		std::vector<char> _bits; // one char holds eight bits
	};

	// Demo: expected output is 1 1 1 0, a blank line, then 1 1 0 1.
	// inline because the definition lives in a header.
	inline void test_bitset()
	{
		bitset<100> bs;
		bs.set(10);
		bs.set(17);
		bs.set(80);

		std::cout << bs.test(10) << std::endl;
		std::cout << bs.test(17) << std::endl;
		std::cout << bs.test(80) << std::endl;
		std::cout << bs.test(81) << std::endl << std::endl;

		bs.reset(80);
		bs.set(81);

		std::cout << bs.test(10) << std::endl;
		std::cout << bs.test(17) << std::endl;
		std::cout << bs.test(80) << std::endl;
		std::cout << bs.test(81) << std::endl;

		// bitset<-1> bs_max;          // N wraps to the largest size_t value
		// bitset<0xffffffff> bs_max;  // roughly 4.2 billion bits
	}
}

结果： 1 1 1 0 1 1 0 1

布隆过滤器BloomFilter.h

#include "BitSet.h"
#include <string>

// First Bloom-filter hash: polynomial string hash with multiplier 131.
struct StrHash1
{
	size_t operator()(const string& s)
	{
		size_t acc = 0;
		for (size_t pos = 0; pos < s.size(); ++pos)
		{
			const char c = s[pos];
			acc = acc * 131 + c;
		}
		return acc;
	}
};

// Second Bloom-filter hash: polynomial string hash with multiplier 65599.
struct StrHash2
{
	size_t operator()(const string& s)
	{
		size_t acc = 0;
		for (string::const_iterator it = s.begin(); it != s.end(); ++it)
		{
			const char c = *it;
			// Equivalent shift form: c + (acc << 6) + (acc << 16) - acc
			acc = 65599 * acc + c;
		}
		return acc;
	}
};

// Third Bloom-filter hash: polynomial string hash whose multiplier itself
// changes on every character (starts at 63689, multiplied by 378551 each step).
struct StrHash3
{
	size_t operator()(const std::string& s) const
	{
		size_t hash = 0;
		size_t magic = 63689;
		for (auto ch : s)
		{
			hash = hash * magic + ch;
			magic *= 378551; // unsigned arithmetic: wraps on overflow by design
		}
		return hash;
	}
};

// Bloom filter sized for about N keys, using three hash functors.
// The underlying bitmap has N * 4 bits; more hash functions / more bits per
// key lower the false-positive rate.
// Test() can report false positives but never false negatives.
template<size_t N, class K = std::string,
class HashFunc1 = StrHash1,
class HashFunc2 = StrHash2,
class HashFunc3 = StrHash3>
class BloomFilter
{
public:
	// The bitmap's size is a template argument of _bs (bit::bitset has no
	// size-taking constructor), so only the modulus is initialized here.
	BloomFilter()
		: _n(N * 4)
	{}

	// Marks key as present by setting its three mapped bits.
	// Also prints the three bit positions (demo output).
	void Set(const K& key)
	{
		HashFunc1 hf1;
		size_t i1 = hf1(key) % _n;
		_bs.set(i1);

		HashFunc2 hf2;
		size_t i2 = hf2(key) % _n;
		_bs.set(i2);

		HashFunc3 hf3;
		size_t i3 = hf3(key) % _n;
		_bs.set(i3);
		std::cout << i1 << "--" << i2 << "--" << i3 << std::endl;
	}

	// Deletion is generally not supported: clearing a key's bits could also
	// clear bits shared with other keys. Declared only; there is no definition.
	void Reset(const K& key);

	// Returns false if key was definitely never Set; true if it may have been.
	bool Test(const K& key)
	{
		HashFunc1 hf1;
		size_t i1 = hf1(key) % _n;
		if (!_bs.test(i1))
		{
			return false;
		}

		HashFunc2 hf2;
		size_t i2 = hf2(key) % _n;
		if (!_bs.test(i2))
		{
			return false;
		}

		HashFunc3 hf3;
		size_t i3 = hf3(key) % _n;
		if (!_bs.test(i3))
		{
			return false; // a single unset bit proves the key is absent
		}
		return true;
	}
private:
	bit::bitset<N * 4> _bs; // must hold _n bits, because every index is taken modulo _n
	size_t _n;              // number of bits in _bs
};

void TestBloomFilter()
{
	BloomFilter<10> bf;
	bf.Set("https://www.cnblogs.com/-clq/archive/2012/05/31/2528153.html");
	bf.Set("https://www.cnblogs.com/-clq/archive/2012/05/31/2528154.html");
	bf.Set("https://www.cnblogs.com/-clq/archive/2012/05/31/2528156.html");
	bf.Set("https://www.cnblogs.com/-clq/archive/2012/05/31/2528165.html");
	bf.Set("https://www.cnblogs.com/-clq/archive/2012/05/31/2528167.html");
	bf.Set("https://www.cnblogs.com/-clq/archive/2012/05/31/2528168.html");
	bf.Set("https://www.cnblogs.com/-clq/archive/2012/05/31/2528169.html");
	bf.Set("https://www.cnblogs.com/-clq/archive/2012/05/31/252816.html");
	bf.Set("https://www.cnblogs.com/-clq/archive/2012/05/31/252817.html");
	bf.Set("https://www.cnblogs.com/-clq/archive/2012/05/31/252818.html");
	bf.Set("https://www.cnblogs.com/-clq/archive/2012/05/31/252819.html");


	cout << bf.Test("https://www.cnblogs.com/-clq/archive/2012/05/31/2528153.html")<<endl;
	cout << bf.Test("https://www.cnblogs.com/-clq/archive/2012/05/31/2528154.html") << endl;
	cout << bf.Test("https://www.cnblogs.com/-clq/archive/2012/05/31/2528156.html") << endl;
	cout << bf.Test("https://www.cnblogs.com/-clq/archive/2012/05/31/2528165.html") << endl<<endl;

	cout << bf.Test("https://www.cnblogs.com/-clq/archive/2012/05/31/2528166.html") << endl;
	cout << bf.Test("https://www.cnblogs.com/-clq/archive/2012/05/31/2528135.html") << endl;
}

海量数据

给两个文件，分别有100亿个query，我们只有1G内存，两个文件交集?精确算法
1、估算给的数据大概占用多少空间。假设一个query平均是10byte字节，100亿个query大概占用100G
2、哈希切分200份
100G query的文件A：创建200个小文件A0，A1,. . . A199;依次获取文件中每个query，i = Hash(query) % 200；每个query进入算出的Ai文件，i=几进入对应的几号小文件
100G query的文件B：创建200个小文件 B0，B1,. . . B199；依次获取文件中每个query，i = Hash(query) % 200；每个query进入算出的Bi文件
Ai小文件和Bi小文件求交集
Ai读取到一个set中，依次取Bi中的query判断在不在Ai中，在就是交集
如果是平均切分也是可以的，只是Ai小文件的交集可能出现在[B0，B199]中的任意一个文件里，都需要比一遍，因为不同的字符串在每个文件里都有
如果是哈希切分，A和B中相同query一定进入编号相同的Ai和Bi，只需要按编号比即可。这样效率高。如果有和A0一样的字符串一定在B0，因为一样的query通过一样的哈希函数Hash(query)，最终得到一样的i。

又是这货

关注

1
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
哈希Hash

文章目录unordered_set961.在长度2N的数组中找出重复N次的元素349.两个数组的交集387.字符串中的第一个唯一字符闭散列方法 -- 线性探测unordered_set#include<iostream>#include<vector>#include<unordered_set>#include<set>#include<time.h>using namespace std;void test_time()..
复制链接

扫一扫