用HashMap实现STL map
在STL中map是用RB Tree(也就是红黑树)来完成的。本文尝试用HashMap来实现map。
问题产生
本文主要是完成以下interface:
#ifndef HASHMAP_HPP
#define HASHMAP_HPP
#include <vector>
#include <list>
#include <utility>
#include <iostream>
#define kHASHSIZE 1024
// Forward declaration so that Node can name HashMap as a friend.
template <typename KEY, typename VALUE>
class HashMap;

/**
 * One link of a singly linked chain holding a key/value pair.
 *
 * Only HashMap (a friend) can create, link, or destroy nodes; outside code
 * can merely read the stored key and value through the public getters.
 *
 * Ownership: a node owns every node reachable through `next`. Destroying a
 * node therefore destroys the whole tail that follows it. Code that wants to
 * remove a single node must unlink it and reset its `next` to nullptr before
 * deleting it.
 */
template <typename KEY, typename VALUE>
class Node {
 public:
  /** @return a copy of the stored key */
  KEY getKey() const;
  /** @return a copy of the stored value */
  VALUE getValue() const;

 private:
  friend class HashMap<KEY, VALUE>;

  // Initializers are listed in declaration order (key, value, next).
  Node() : key(), value(), next(nullptr) {}
  Node(const KEY &k, const VALUE &v) : key(k), value(v), next(nullptr) {}

  // A copy would share `next` with the original and the tail would be
  // deleted twice, so copying is disabled.
  Node(const Node &) = delete;
  Node &operator=(const Node &) = delete;

  // Cascade deletion of the tail, done iteratively so that a very long chain
  // cannot exhaust the call stack through recursive destructor calls.
  ~Node() {
    Node *cursor = next;
    next = nullptr;
    while (cursor != nullptr) {
      Node *following = cursor->next;
      cursor->next = nullptr;  // detach first: its destructor must not cascade
      delete cursor;
      cursor = following;
    }
  }

  KEY key;
  VALUE value;
  Node *next;
};
/**
 * A map built on a fixed-size hash table with separate chaining.
 *
 * The table has kHASHSIZE buckets. Each bucket is a singly linked list that
 * starts with a sentinel node; the real entries hang off the sentinel's
 * `next` pointer.
 */
template <typename KEY, typename VALUE>
class HashMap {
 public:
  /**
   * Default constructor: builds an empty map.
   */
  HashMap();

  /**
   * Builds a map from a vector of key/value pairs.
   * @param t_pairs the pairs to store
   * @note duplicate keys in the vector lead to undefined behavior
   */
  HashMap(const std::vector<std::pair<KEY, VALUE> > &t_pairs);

  /**
   * Builds a map from a list of key/value pairs.
   * @param t_pairs the pairs to store
   * @note duplicate keys in the list lead to undefined behavior
   */
  HashMap(const std::list<std::pair<KEY, VALUE> > &t_pairs);

  /**
   * Copy constructor.
   * @param t_another the map to copy from
   */
  HashMap(const HashMap &t_another);

  /**
   * Destructor.
   */
  ~HashMap();

  /**
   * Copy assignment.
   * @param t_another the map to copy from
   * @return a reference to this map
   */
  HashMap &operator=(const HashMap &t_another);

  /**
   * @return the number of pairs currently stored
   */
  int size() const;

  /**
   * @return true when the map holds no pairs
   */
  bool empty() const;

  /**
   * Stores a key/value pair.
   * @param t_key the key
   * @param t_value the value
   * @note inserting a key that is already present replaces its value
   */
  void insert(const KEY &t_key, const VALUE &t_value);

  /**
   * Tests whether a key is present.
   * @param t_key the key to look for
   * @return true when the map contains the key
   */
  bool has(const KEY &t_key) const;

  /**
   * Removes the pair stored under a key.
   * @param t_key the key to remove
   * @return true when a pair was removed
   */
  bool erase(const KEY &t_key);

  /**
   * Removes every pair from the map.
   */
  void clear();

  /**
   * @return a vector holding every key
   */
  std::vector<KEY> keys() const;

  /**
   * @return a vector holding every value
   */
  std::vector<VALUE> values() const;

  /**
   * @return a vector holding every key/value pair
   */
  std::vector<std::pair<KEY, VALUE> > items() const;

  /**
   * Accesses the value stored under a key.
   * @param t_key the key
   * @return a reference to the stored value
   * @note when the key is absent, a new pair is created implicitly for it
   */
  VALUE &operator[](const KEY &t_key);

 private:
  /** Hash function mapping a key to an unsigned integer. */
  unsigned int HashValue(const KEY &t_key) const;

  /**
   * Locates the node that stores `key`.
   * @return the matching node, or a null pointer when the key is absent
   */
  Node<KEY, VALUE> *find(const KEY &key) {
    const unsigned long long hashed = HashValue(key);
    const int bucket = hashed % kHASHSIZE;
    // Skip the sentinel; real entries start at its successor.
    Node<KEY, VALUE> *candidate = m_heads[bucket]->next;
    while (candidate != nullptr && !(candidate->key == key)) {
      candidate = candidate->next;
    }
    return candidate;
  }

  std::vector<Node<KEY, VALUE> *> m_heads;  // one sentinel node per bucket
  int m_size;                               // number of stored pairs
};
#include "HashMap.cc"
#endif // HASHMAP_HPP
并通过以下测试代码:
#include <algorithm>
#include <iostream>
#include <list>
#include <string>
#include <vector>
#include "HashMap.hpp"
/**
 * Callable that writes one key/value pair to std::cout as a single line of
 * the form {"key":"<key>","value":"<value>"}.
 */
template <typename KEY, typename VALUE>
struct Functor {
  void operator()(const std::pair<KEY, VALUE>& item) {
    std::cout << "{\"key\":\"" << item.first << "\","
              << "\"value\":\"" << item.second << "\"}"
              << std::endl;
  }
};
template <typename KEY, typename VALUE>
void print_items(const HashMap<KEY, VALUE>& t_my_map) {
std::vector<std::pair<KEY, VALUE> > items = t_my_map.items();
Functor<KEY, VALUE> functor;
// promise the order does not matter
std::sort(items.begin(), items.end());
std::cout << "empty:" << (t_my_map.empty() ? "true" : "false") << " ";
std::cout << "size:" << t_my_map.size() << std::endl;
std::for_each(items.begin(), items.end(), functor);
}
/**
 * Test driver for HashMap<std::string, int>.
 *
 * Reads whitespace-separated "<key> <int>" records from standard input until
 * extraction fails, then exercises the constructors, copy assignment,
 * insert/erase/has, operator[], keys/values and clear, printing the state of
 * the map after each step.
 */
int main() {
  typedef std::pair<std::string, int> Entry;
  typedef std::vector<Entry> EntryVector;

  EntryVector data;
  std::string temp_str;
  int temp_int;
  while (std::cin >> temp_str >> temp_int) {
    data.push_back(Entry(temp_str, temp_int));
  }

  // testing constructors
  std::cout << "testing constructors" << std::endl;
  {
    // default constructor
    HashMap<std::string, int> map0;
    print_items(map0);
    // construct from vector
    std::cout << "constructing from vector: " << std::endl;
    HashMap<std::string, int> map1(data);
    print_items(map1);
    // construct from list
    std::list<Entry> list_data(data.begin(), data.end());
    HashMap<std::string, int> map2(list_data);
    std::cout << "constructing from list: " << std::endl;
    print_items(map2);
    // copy constructor
    HashMap<std::string, int> map_copy(map2);
    std::cout << "copy constructing: " << std::endl;
    print_items(map_copy);
  }
  std::cout << std::endl;

  // testing copy assignment operator
  std::cout << "testing copy assignment operator" << std::endl;
  {
    // testing normal assignment
    std::cout << "testing normal assignment:" << std::endl;
    HashMap<std::string, int> map1(data);
    HashMap<std::string, int> map2(data);
    print_items(map2);
    // self-assignment is intentional: it must leave the map untouched
    map2 = map2;
    print_items(map2);
    // chained assignment is intentional: operator= must return *this
    std::cout << "testing cascade assigment" << std::endl;
    map1 = map2 = map1.operator=(map2);
    print_items(map2);
  }
  std::cout << std::endl;

  // testing modify functions
  std::cout << "testing modify functions" << std::endl;
  {
    // testing insert: every key is already present, so values are replaced
    std::cout << "testing insert" << std::endl;
    HashMap<std::string, int> map1(data);
    for (const Entry& entry : data) {
      map1.insert(entry.first, entry.second);
    }
    print_items(map1);

    // testing erase: remove the first half of the input keys
    std::cout << "testing erase" << std::endl;
    const EntryVector::size_type half = data.size() / 2;
    for (EntryVector::size_type i = 0; i < half; ++i) {
      std::cout << "erasing " << data[i].first << ": ";
      std::cout << (map1.erase(data[i].first) ? "true" : "false");
      std::cout << std::endl;
    }
    print_items(map1);

    // testing has
    std::cout << "testing has:" << std::endl;
    for (const Entry& entry : data) {
      std::cout << "has entry " << entry.first << "? ";
      std::cout << (map1.has(entry.first) ? "true" : "false");
      std::cout << std::endl;
    }
  }
  std::cout << std::endl;

  // testing operator []
  std::cout << "testing operator []" << std::endl;
  {
    HashMap<std::string, int> map1;
    // first pass creates the entries, second pass overwrites them
    for (const Entry& entry : data) {
      map1[entry.first] = entry.second;
    }
    for (const Entry& entry : data) {
      map1[entry.first] = 0;
    }
    print_items(map1);
  }
  std::cout << std::endl;

  // testing keys and values
  std::cout << "testing keys and values" << std::endl;
  {
    HashMap<std::string, int> map1(data);
    std::vector<std::string> keys = map1.keys();
    std::vector<int> values = map1.values();
    std::sort(keys.begin(), keys.end());
    std::sort(values.begin(), values.end());
    for (std::vector<std::string>::size_type i = 0; i < keys.size(); ++i) {
      std::cout << "key" << i << " " << keys[i] << std::endl;
    }
    for (std::vector<int>::size_type i = 0; i < values.size(); ++i) {
      std::cout << "value" << i << " " << values[i] << std::endl;
    }
  }
  std::cout << std::endl;

  // testing clear
  std::cout << "testing clear" << std::endl;
  {
    HashMap<std::string, int> map1(data);
    map1.clear();
    print_items(map1);
  }
  std::cout << std::endl;
  return 0;
}
背景知识
哈希算法将任意长度的二进制值映射为固定长度的较小二进制值,这个小的二进制值称为哈希值。哈希值是一段数据极其紧凑的数值表示形式。对于密码学哈希函数,如果散列一段明文而且哪怕只更改该段落的一个字母,随后的哈希都将产生不同的值;要找到散列为同一个值的两个不同的输入,在计算上来说基本上是不可能的。但下文用到的简单字符串哈希并不具备这种性质:不同的key可能得到相同的哈希值(即哈希冲突),所以哈希表的每个桶都用链表来保存发生冲突的键值对。
我的理解是哈希值就是一段数据的特征值,用来特化一段数据。
一个key(string)通过hash function可以对应到一个index,然后把相应的value放入这个index里面。
所以问题的关键在于:如何计算哈希值?
哈希值计算
以下给出几种计算哈希值的算法,以及它们的效率。
/**
 * SDBM string hash.
 *
 * @param str NUL-terminated string to hash; a null pointer is treated as the
 *            empty string. Taken as `const char *` so that string literals
 *            and const data can be hashed; the string is never modified.
 * @return the hash with the top bit cleared (range 0 .. 0x7FFFFFFF)
 */
unsigned int SDBMHash(const char *str) {
  if (!str) str = "";
  unsigned int hash = 0;
  while (*str) {
    // equivalent to: hash = 65599 * hash + (*str++);
    hash = (*str++) + (hash << 6) + (hash << 16) - hash;
  }
  return (hash & 0x7FFFFFFF);
}
/**
 * RS hash function.
 *
 * Multiplies the running hash by a factor that itself changes after every
 * character (a *= b), then adds the character.
 *
 * @param str NUL-terminated string to hash; a null pointer is treated as the
 *            empty string. Taken as `const char *` so that string literals
 *            and const data can be hashed; the string is never modified.
 * @return the hash with the top bit cleared (range 0 .. 0x7FFFFFFF)
 */
unsigned int RSHash(const char *str) {
  if (!str) str = "";
  const unsigned int b = 378551;
  unsigned int a = 63689;
  unsigned int hash = 0;
  while (*str) {
    hash = hash * a + (*str++);
    a *= b;
  }
  return (hash & 0x7FFFFFFF);
}
/**
 * JS hash function.
 *
 * Starts from the seed 1315423911 and, for every character, XORs the hash
 * with a mix of its own shifted copies and the character.
 *
 * @param str NUL-terminated string to hash; a null pointer is treated as the
 *            empty string. Taken as `const char *` so that string literals
 *            and const data can be hashed; the string is never modified.
 * @return the hash with the top bit cleared (range 0 .. 0x7FFFFFFF)
 */
unsigned int JSHash(const char *str) {
  if (!str) str = "";
  unsigned int hash = 1315423911;
  while (*str) {
    hash ^= ((hash << 5) + (*str++) + (hash >> 2));
  }
  return (hash & 0x7FFFFFFF);
}
/**
 * P. J. Weinberger hash function.
 *
 * Shifts the hash left by one eighth of the width of `unsigned int` per
 * character; whenever bits reach the top eighth they are folded back into
 * the lower part and cleared.
 *
 * @param str NUL-terminated string to hash; a null pointer is treated as the
 *            empty string. Taken as `const char *` so that string literals
 *            and const data can be hashed; the string is never modified.
 * @return the hash with the top bit cleared (range 0 .. 0x7FFFFFFF)
 */
unsigned int PJWHash(const char *str) {
  if (!str) str = "";
  const unsigned int bits_in_unsigned_int =
      static_cast<unsigned int>(sizeof(unsigned int) * 8);
  const unsigned int three_quarters = (bits_in_unsigned_int * 3) / 4;
  const unsigned int one_eighth = bits_in_unsigned_int / 8;
  // Mask selecting the top one-eighth of the bits.
  const unsigned int high_bits = 0xFFFFFFFFu
                                 << (bits_in_unsigned_int - one_eighth);
  unsigned int hash = 0;
  unsigned int test = 0;
  while (*str) {
    hash = (hash << one_eighth) + (*str++);
    if ((test = hash & high_bits) != 0) {
      hash = ((hash ^ (test >> three_quarters)) & (~high_bits));
    }
  }
  return (hash & 0x7FFFFFFF);
}
/**
 * ELF hash function.
 *
 * Shifts the hash left by four bits per character; whenever the top four
 * bits become non-zero they are XORed back in 24 bits lower and cleared.
 *
 * @param str NUL-terminated string to hash; a null pointer is treated as the
 *            empty string. Taken as `const char *` so that string literals
 *            and const data can be hashed; the string is never modified.
 * @return the hash with the top bit cleared (range 0 .. 0x7FFFFFFF)
 */
unsigned int ELFHash(const char *str) {
  if (!str) str = "";
  unsigned int hash = 0;
  unsigned int x = 0;
  while (*str) {
    hash = (hash << 4) + (*str++);
    if ((x = hash & 0xF0000000L) != 0) {
      hash ^= (x >> 24);
      hash &= ~x;
    }
  }
  return (hash & 0x7FFFFFFF);
}
/**
 * BKDR hash function.
 *
 * Polynomial hash: hash = hash * seed + character, with seed 131.
 *
 * @param str NUL-terminated string to hash; a null pointer is treated as the
 *            empty string. Taken as `const char *` so that string literals
 *            and const data can be hashed; the string is never modified.
 * @return the hash with the top bit cleared (range 0 .. 0x7FFFFFFF)
 */
unsigned int BKDRHash(const char *str) {
  if (!str) str = "";
  const unsigned int seed = 131;  // 31 131 1313 13131 131313 etc..
  unsigned int hash = 0;
  while (*str) {
    hash = hash * seed + (*str++);
  }
  return (hash & 0x7FFFFFFF);
}
/**
 * DJB hash function.
 *
 * Starts from 5381 and computes hash = hash * 33 + character, written as
 * hash += (hash << 5) + character.
 *
 * @param str NUL-terminated string to hash; a null pointer is treated as the
 *            empty string. Taken as `const char *` so that string literals
 *            and const data can be hashed; the string is never modified.
 * @return the hash with the top bit cleared (range 0 .. 0x7FFFFFFF)
 */
unsigned int DJBHash(const char *str) {
  if (!str) str = "";
  unsigned int hash = 5381;
  while (*str) {
    hash += (hash << 5) + (*str++);
  }
  return (hash & 0x7FFFFFFF);
}
/**
 * AP hash function.
 *
 * Alternates between two mixing steps: characters at even positions use one
 * XOR/shift combination, characters at odd positions use a complemented one.
 *
 * @param str NUL-terminated string to hash; a null pointer is treated as the
 *            empty string. Taken as `const char *` so that string literals
 *            and const data can be hashed; the string is never modified.
 * @return the hash with the top bit cleared (range 0 .. 0x7FFFFFFF)
 */
unsigned int APHash(const char *str) {
  if (!str) str = "";
  unsigned int hash = 0;
  for (int i = 0; *str; i++) {
    if ((i & 1) == 0) {
      hash ^= ((hash << 7) ^ (*str++) ^ (hash >> 3));
    } else {
      hash ^= (~((hash << 11) ^ (*str++) ^ (hash >> 5)));
    }
  }
  return (hash & 0x7FFFFFFF);
}
问题解决
我采用的是第一种哈希值的算法(即SDBMHash),并用链表法解决哈希冲突。
//
// Hashmap.cpp
// Hash
//
// Created by 颜泽鑫 on 6/4/16.
// Copyright © 2016 颜泽鑫. All rights reserved.
//
#include "HashMap.hpp"
/**
 * @return a copy of the key stored in this node
 */
template <typename KEY, typename VALUE>
KEY Node<KEY, VALUE>::getKey() const {
  return this->key;
}
/**
 * @return a copy of the value stored in this node
 */
template <typename KEY, typename VALUE>
VALUE Node<KEY, VALUE>::getValue() const {
  return this->value;
}
/**
 * Default constructor: creates kHASHSIZE empty buckets.
 *
 * Every bucket starts with a sentinel node. The sentinel is
 * default-constructed, so KEY and VALUE only have to be default
 * constructible; they need not be constructible from "" or 0.
 */
template <typename KEY, typename VALUE>
HashMap<KEY, VALUE>::HashMap() : m_heads(), m_size(0) {
  m_heads.reserve(kHASHSIZE);
  for (int i = 0; i != kHASHSIZE; ++i) {
    m_heads.push_back(new Node<KEY, VALUE>());
  }
}
/**
 * Builds a map from a vector of key/value pairs.
 *
 * Every bucket starts with a default-constructed sentinel node, so KEY and
 * VALUE need not be constructible from "" or 0. The pairs are then stored
 * one by one through insert().
 *
 * @param t_pairs the pairs to store
 */
template <typename KEY, typename VALUE>
HashMap<KEY, VALUE>::HashMap(
    const std::vector<std::pair<KEY, VALUE> > &t_pairs)
    : m_heads(), m_size(0) {
  m_heads.reserve(kHASHSIZE);
  for (int i = 0; i != kHASHSIZE; ++i) {
    m_heads.push_back(new Node<KEY, VALUE>());
  }
  // Iterate by const reference: no per-element copy of the pair.
  for (const auto &entry : t_pairs) {
    insert(entry.first, entry.second);
  }
}
/**
 * Builds a map from a list of key/value pairs.
 *
 * Every bucket starts with a default-constructed sentinel node, so KEY and
 * VALUE need not be constructible from "" or 0. The pairs are then stored
 * one by one through insert().
 *
 * @param t_pairs the pairs to store
 */
template <typename KEY, typename VALUE>
HashMap<KEY, VALUE>::HashMap(const std::list<std::pair<KEY, VALUE> > &t_pairs)
    : m_heads(), m_size(0) {
  m_heads.reserve(kHASHSIZE);
  for (int i = 0; i != kHASHSIZE; ++i) {
    m_heads.push_back(new Node<KEY, VALUE>());
  }
  // Iterate by const reference: no per-element copy of the pair.
  for (const auto &entry : t_pairs) {
    insert(entry.first, entry.second);
  }
}
/**
 * Copy constructor: builds fresh buckets and re-inserts every pair of
 * `t_another`, so the two maps share no nodes.
 *
 * Every bucket starts with a default-constructed sentinel node, so KEY and
 * VALUE need not be constructible from "" or 0.
 *
 * @param t_another the map to copy from
 */
template <typename KEY, typename VALUE>
HashMap<KEY, VALUE>::HashMap(const HashMap &t_another)
    : m_heads(), m_size(0) {
  m_heads.reserve(kHASHSIZE);
  for (int i = 0; i != kHASHSIZE; ++i) {
    m_heads.push_back(new Node<KEY, VALUE>());
  }
  const std::vector<std::pair<KEY, VALUE> > snapshot = t_another.items();
  for (const auto &entry : snapshot) {
    insert(entry.first, entry.second);
  }
}
/**
 * Destructor: deletes the sentinel node of every bucket. Deleting a node
 * also deletes the nodes chained behind it, which releases the whole bucket.
 */
template <typename KEY, typename VALUE>
HashMap<KEY, VALUE>::~HashMap() {
  for (Node<KEY, VALUE> *head : m_heads) {
    delete head;
  }
}
/**
 * Copy assignment: replaces the contents of this map with the pairs of
 * `t_another`. Self-assignment leaves the map untouched.
 *
 * @param t_another the map to copy from
 * @return a reference to this map
 */
template <typename KEY, typename VALUE>
HashMap<KEY, VALUE> &HashMap<KEY, VALUE>::operator=(
    const HashMap<KEY, VALUE> &t_another) {
  if (this != &t_another) {
    clear();
    m_size = 0;
    const std::vector<std::pair<KEY, VALUE> > snapshot = t_another.items();
    typedef typename std::vector<std::pair<KEY, VALUE> >::const_iterator Iter;
    for (Iter it = snapshot.begin(); it != snapshot.end(); ++it) {
      insert(it->first, it->second);
    }
  }
  return *this;
}
/**
 * @return the number of pairs currently stored
 */
template <typename KEY, typename VALUE>
int HashMap<KEY, VALUE>::size() const {
  return this->m_size;
}
/**
 * @return true when the map holds no pairs
 */
template <typename KEY, typename VALUE>
bool HashMap<KEY, VALUE>::empty() const {
  return 0 == m_size;
}
/**
 * Stores a key/value pair.
 *
 * Walks the bucket chain once: if the key is already present its value is
 * overwritten, otherwise a new node is appended at the end of the chain and
 * the size grows by one.
 *
 * @param t_key the key
 * @param t_value the value to store under the key
 */
template <typename KEY, typename VALUE>
void HashMap<KEY, VALUE>::insert(const KEY &t_key, const VALUE &t_value) {
  // `tail` starts at the sentinel and always trails the node being examined.
  Node<KEY, VALUE> *tail = m_heads[HashValue(t_key) % kHASHSIZE];
  for (; tail->next != nullptr; tail = tail->next) {
    if (tail->next->key == t_key) {
      tail->next->value = t_value;
      return;
    }
  }
  tail->next = new Node<KEY, VALUE>(t_key, t_value);
  ++m_size;
}
/**
 * Tests whether a key is present.
 *
 * @param t_key the key to look for
 * @return true when some node in the key's bucket stores that key
 */
template <typename KEY, typename VALUE>
bool HashMap<KEY, VALUE>::has(const KEY &t_key) const {
  const unsigned long long hashed = HashValue(t_key);
  const int bucket = hashed % kHASHSIZE;
  // Skip the sentinel; real entries start at its successor.
  const Node<KEY, VALUE> *node = m_heads[bucket]->next;
  while (node != nullptr) {
    if (node->key == t_key) {
      return true;
    }
    node = node->next;
  }
  return false;
}
/**
 * Removes the pair stored under a key.
 *
 * Only the matching node is removed; every other pair in the same bucket is
 * kept. The walk tracks the predecessor of the node being examined so that
 * the match can be unlinked directly.
 *
 * @param t_key the key to remove
 * @return true when a pair was removed, false when the key was not present
 */
template <typename KEY, typename VALUE>
bool HashMap<KEY, VALUE>::erase(const KEY &t_key) {
  // Start at the sentinel so that even the first real node has a predecessor.
  Node<KEY, VALUE> *previous = m_heads[HashValue(t_key) % kHASHSIZE];
  while (previous->next != nullptr && !(previous->next->key == t_key)) {
    previous = previous->next;
  }

  Node<KEY, VALUE> *victim = previous->next;
  if (victim == nullptr) {
    return false;
  }

  previous->next = victim->next;
  // Deleting a node also deletes everything chained behind it, so the victim
  // must be detached from its successors before it is destroyed.
  victim->next = nullptr;
  delete victim;
  --m_size;
  return true;
}
/**
 * Removes every pair from the map. The sentinel node of each bucket is kept,
 * so the map stays usable afterwards.
 */
template <typename KEY, typename VALUE>
void HashMap<KEY, VALUE>::clear() {
  for (int bucket = 0; bucket < kHASHSIZE; ++bucket) {
    Node<KEY, VALUE> *head = m_heads[bucket];
    Node<KEY, VALUE> *chain = head->next;
    head->next = nullptr;
    // Deleting the first node releases the whole chain behind it;
    // deleting a null pointer (empty bucket) does nothing.
    delete chain;
  }
  m_size = 0;
}
/**
 * Collects every key, bucket by bucket and in chain order within a bucket.
 *
 * @return a vector holding a copy of each stored key
 */
template <typename KEY, typename VALUE>
std::vector<KEY> HashMap<KEY, VALUE>::keys(void) const {
  std::vector<KEY> collected;
  for (int bucket = 0; bucket < kHASHSIZE; ++bucket) {
    // Skip the sentinel; real entries start at its successor.
    for (const Node<KEY, VALUE> *node = m_heads[bucket]->next;
         node != nullptr; node = node->next) {
      collected.push_back(node->key);
    }
  }
  return collected;
}
/**
 * Collects every value, bucket by bucket and in chain order within a bucket.
 *
 * @return a vector holding a copy of each stored value
 */
template <typename KEY, typename VALUE>
std::vector<VALUE> HashMap<KEY, VALUE>::values(void) const {
  std::vector<VALUE> collected;
  for (int bucket = 0; bucket < kHASHSIZE; ++bucket) {
    // Skip the sentinel; real entries start at its successor.
    for (const Node<KEY, VALUE> *node = m_heads[bucket]->next;
         node != nullptr; node = node->next) {
      collected.push_back(node->value);
    }
  }
  return collected;
}
/**
 * Collects every key/value pair, bucket by bucket and in chain order within
 * a bucket.
 *
 * The pair is built with an explicitly named std::pair type, so the code
 * does not depend on argument-dependent lookup finding `make_pair`; that
 * lookup only succeeds when KEY or VALUE happens to live in namespace std.
 *
 * @return a vector holding a copy of each stored pair
 */
template <typename KEY, typename VALUE>
std::vector<std::pair<KEY, VALUE> > HashMap<KEY, VALUE>::items(void) const {
  std::vector<std::pair<KEY, VALUE> > ans;
  for (int i = 0; i != kHASHSIZE; ++i) {
    // Skip the sentinel; real entries start at its successor.
    for (const Node<KEY, VALUE> *node = m_heads[i]->next; node != nullptr;
         node = node->next) {
      ans.push_back(std::pair<KEY, VALUE>(node->key, node->value));
    }
  }
  return ans;
}
/**
 * Accesses the value stored under a key.
 *
 * When the key is absent, a pair with a value-initialized VALUE is inserted
 * first (0 for arithmetic types, an empty object for class types), so VALUE
 * does not have to be constructible from the literal 0.
 *
 * @param t_key the key
 * @return a reference to the value stored under the key
 */
template <typename KEY, typename VALUE>
VALUE &HashMap<KEY, VALUE>::operator[](const KEY &t_key) {
  Node<KEY, VALUE> *node = find(t_key);
  if (node == nullptr) {
    insert(t_key, VALUE());
    node = find(t_key);
  }
  return node->value;
}
/**
 * Hashes a key with the SDBM recurrence hash = 65599 * hash + character.
 *
 * The key is read element by element through operator[] until an element
 * equal to '\0' is reached.
 *
 * @param t_key the key to hash
 * @return the hash with the top bit cleared (range 0 .. 0x7FFFFFFF)
 */
template <typename KEY, typename VALUE>
unsigned int HashMap<KEY, VALUE>::HashValue(const KEY &t_key) const {
  unsigned int hash = 0;
  for (int pos = 0; t_key[pos] != '\0'; ++pos) {
    // 65599 == (1 << 6) + (1 << 16) - 1, the shift form of the same step
    hash = hash * 65599u + t_key[pos];
  }
  return hash & 0x7FFFFFFF;
}