当前位置：首页 > news >正文

哈希表与unordered_set和unordered_map的实现

news 2025/9/23 21:57:32

一、哈希表

1.1 什么是哈希表

哈希（散列）是通过一个映射函数把关键字直接映射为存储地址的方法，从而使查找的平均时间复杂度达到O(1)。例如：

a[10] = {1, 3, 39, 24, 12, 7, 56, 85, 98, 20}

将每个值对数组大小10取模，再将取模得到的结果直接作为该值在数组中的存储位置：

a[10] = {20, 1, 12, 3, 24, 85, 56, 7, 98, 39}

这样，如果要查找12，12%10 = 2， a[2] = 12，从而使查找效率达到O(1)。

这只是一个例子，只需要有对应的映射函数，即可。例如，可以将string类型的数据，每个字符的ascii码直接相加，形成一个整型值。

1.2 冲突与解决方法

在进行映射的时候，有可能将不同的关键字映射到同一个位置，这个时候就产生了冲突，例如：

在上述a数组的例子中，如果再插入一个22，那么22和12在进行映射计算后，得到的都是2，这样就产生了冲突，22究竟应该放到哪里呢？这就需要解决冲突的办法：

1.2.1 开放地址法

开放地址法，也叫闭散列法，就是在哈希表数组内部寻找下一个存储的位置。使用了这种方法以后，如果查找的第一个值不是要找的值，就需要按寻找下一个存储位置的方法继续查找，直到找到一个空余位置或者找到要查找的值。因此，需要注意：删除数据后，要在被删除数据的位置做标记，表示该位置是被删除，而不是空余。而寻找下一个位置的方法也有多种，总结为：

线性探测法：即线性的在冲突位置后面一个一个位置查找是否有空余位置。
二次探测法：即依次在冲突位置之后的 +1^2、+2^2、+3^2……等偏移处进行查找，每次计算出的下标都需要对数组大小取模，避免访问越界，直到找到空闲位置。
双重散列：即再使用另一个散列函数，针对冲突的数据进行第二次散列，若还冲突仍可以使用该散列函数继续探测

但是如果数值存储的太多，冲突就会频繁发生，所以通常会定一个值叫做负载因子，即存储的数据数量/总空间数量。

负载因子过大会导致冲突频繁，负载因子过小又会浪费空间，通常取0.7。

那么基于开放地址法中的线性探测法，即可写出一个哈希表来：

// Default hash functor: turns a key into a size_t with a plain cast.
// Usable for any key type that a C-style cast can convert to size_t.
template<class K>
struct HashFunc
{
    // const-qualified so the functor can also be invoked through a const object.
    size_t operator()(const K& key) const
    {
        // The C-style cast is kept on purpose: it also accepts pointer keys,
        // which static_cast<size_t> would reject.
        return (size_t)key;
    }
};

// Specialization that lets the hash table work with string keys.
template<>
struct HashFunc<std::string>
{
    // Polynomial hash: hash = hash * 131 + byte, folded over every character.
    // An empty string hashes to 0.
    size_t operator()(const std::string& key) const
    {
        size_t hash = 0;
        for (const char ch : key)
        {
            hash *= 131;
            // Go through unsigned char so bytes >= 0x80 contribute the same
            // amount whether plain char is signed or unsigned.
            hash += static_cast<unsigned char>(ch);
        }
        return hash;
    }
};
// The listing that follows resolves collisions with open addressing (linear probing).
// Redeclaration of the file's default hash functor (its definition precedes
// this namespace). It only makes the default `Hash` argument below resolvable.
template<class K>
struct HashFunc;

// Hash table that stores every element inside the table itself and resolves
// collisions by linear probing.
namespace W_open_address
{
    // Occupancy state of one slot.
    enum State
    {
        EXIST,   // slot holds a live element
        EMPTY,   // slot has never been used; a probe sequence may stop here
        DELETE   // tombstone: element was erased, probing must continue past it
    };

    // One slot of the table: the key/value pair plus its occupancy state.
    template<class K, class V>
    struct HashData
    {
        std::pair<K, V> _kv;
        State _state = EMPTY;
    };

    // K: key type, V: mapped type, Hash: functor converting a key to size_t.
    template<class K, class V, class Hash = HashFunc<K>>
    class HashTable
    {
    public:
        // Starts with 10 empty slots.
        HashTable()
        {
            _tables.resize(10);
        }

        // Inserts kv. Returns false (and changes nothing) when the key is
        // already present, true otherwise.
        bool Insert(const std::pair<K, V>& kv)
        {
            if (Find(kv.first))
            {
                return false;
            }

            // Grow once the load factor reaches 0.7. Integer arithmetic
            // avoids a floating-point comparison.
            if (_n * 10 / _tables.size() >= 7)
            {
                HashTable<K, V, Hash> grown;
                grown._tables.resize(_tables.size() * 2);
                for (const auto& slot : _tables)
                {
                    if (slot._state == EXIST)
                    {
                        grown.Insert(slot._kv);
                    }
                }
                // Only the storage is exchanged; the live-element count is unchanged.
                _tables.swap(grown._tables);
            }

            Hash hs;
            size_t idx = hs(kv.first) % _tables.size();
            // Linear probing: take the first slot that is not live
            // (tombstones are reused).
            while (_tables[idx]._state == EXIST)
            {
                idx = (idx + 1) % _tables.size();
            }

            _tables[idx]._kv = kv;
            _tables[idx]._state = EXIST;
            ++_n;
            return true;
        }

        // Returns a pointer to the slot holding key, or nullptr when absent.
        HashData<K, V>* Find(const K& key)
        {
            Hash hs;
            const size_t capacity = _tables.size();
            size_t idx = hs(key) % capacity;

            // Visit each slot at most once. This bounds the search even when
            // tombstones have left no EMPTY slot, and it examines every slot
            // of the probe sequence, including the last one.
            for (size_t probes = 0; probes < capacity; ++probes)
            {
                HashData<K, V>& slot = _tables[idx];
                if (slot._state == EMPTY)
                {
                    return nullptr;
                }
                if (slot._state == EXIST && slot._kv.first == key)
                {
                    return &slot;
                }
                idx = (idx + 1) % capacity;
            }
            return nullptr;
        }

        // Marks the slot holding key as a tombstone.
        // Returns true when an element was removed.
        bool Erase(const K& key)
        {
            HashData<K, V>* slot = Find(key);
            if (slot == nullptr)
            {
                return false;
            }
            slot->_state = DELETE;
            --_n;
            return true;
        }

        // Prints the mapped value of every slot in table order. Tombstones are
        // shown as blanks; never-used slots print their default-constructed value.
        void Print()
        {
            for (auto& slot : _tables)
            {
                if (slot._state == DELETE)
                {
                    std::cout << "  ";
                }
                else
                {
                    std::cout << slot._kv.second << " ";
                }
            }
            std::cout << std::endl;
        }

    private:
        std::vector<HashData<K, V>> _tables;
        size_t _n = 0;  // number of live (EXIST) elements
    };
}

代码中的HashFunc就是将string等数据类型转化为size_t的函数，针对不同的数据类型可以写不同的HashFunc。

1.2.2 链地址法

链地址法就是将冲突的数据用链表，链接在相同的哈希表的地址上，使映射以后相同地址的数据都存储在同一个链表中，在查找时只需要找到哈希表的位置，再遍历链表即可。

这种方法相对开放地址法更简单，插入的时间复杂度也会更低，在链表过长的时候，也可以将链表转换成红黑树，进一步提高查找的效率。

那么我们就可以根据链地址法再写出一个哈希表：

    // Node of the singly linked chain that hangs off one bucket.
    template<class T>
    struct HashBucketNode
    {
        T _data;
        HashBucketNode<T>* _next;

        HashBucketNode(const T& data)
            : _data(data), _next(nullptr)
        {}
    };

    // Default hash functor: converts the key to size_t with a plain cast.
    template<class K>
    struct HashFunc
    {
    public:
        size_t operator()(const K& key) const
        {
            return (size_t)key;
        }
    };

    // String keys: polynomial hash with multiplier 31.
    template<>
    struct HashFunc<std::string>
    {
    public:
        size_t operator()(const std::string& key) const
        {
            size_t hash = 0;
            for (const char ch : key)
            {
                hash *= 31;
                // unsigned char keeps the result independent of char signedness
                hash += static_cast<unsigned char>(ch);
            }
            return hash;
        }
    };

    // Hash table with separate chaining.
    //   K      - key type
    //   T      - stored element type (kept generic so map and set can share the table)
    //   KeyOfT - functor extracting the key from a T
    //   Hash   - functor converting a key to size_t (the bucket is hash % bucket count)
    template<class K, class T, class KeyOfT, class Hash = HashFunc<K>>
    class HashTable
    {
        typedef HashBucketNode<T> Node;

    public:
        // Starts with 10 empty buckets.
        HashTable()
        {
            _tables.resize(10, nullptr);
        }

        // Deep copy. Without it the implicit copy would share nodes between
        // two tables and both destructors would delete them.
        HashTable(const HashTable& other)
            : _tables(other._tables.size(), nullptr)
        {
            try
            {
                for (size_t i = 0; i < other._tables.size(); ++i)
                {
                    Node** tail = &_tables[i];
                    for (const Node* src = other._tables[i]; src; src = src->_next)
                    {
                        *tail = new Node(src->_data);
                        tail = &(*tail)->_next;
                        ++_n;
                    }
                }
            }
            catch (...)
            {
                // The destructor does not run for a half-built object.
                Clear();
                throw;
            }
        }

        // Copy-and-swap assignment: `other` is already a private deep copy.
        HashTable& operator=(HashTable other)
        {
            _tables.swap(other._tables);
            const size_t count = _n;
            _n = other._n;
            other._n = count;
            return *this;
        }

        // Frees every node of every bucket.
        ~HashTable()
        {
            Clear();
        }

        // Inserts data. Returns false (and changes nothing) when an element
        // with the same key already exists.
        bool Insert(const T& data)
        {
            KeyOfT kot;
            Hash hs;
            if (Find(kot(data)))
            {
                return false;
            }

            // Load factor 1: double the bucket array and relink the existing
            // nodes instead of reallocating them.
            if (_n == _tables.size())
            {
                std::vector<Node*> grown(2 * _tables.size(), nullptr);
                for (size_t i = 0; i < _tables.size(); ++i)
                {
                    Node* cur = _tables[i];
                    while (cur)
                    {
                        Node* next = cur->_next;
                        const size_t target = hs(kot(cur->_data)) % grown.size();
                        cur->_next = grown[target];
                        grown[target] = cur;
                        cur = next;
                    }
                    _tables[i] = nullptr;
                }
                _tables.swap(grown);
            }

            // Push the new node at the head of its bucket.
            const size_t idx = hs(kot(data)) % _tables.size();
            Node* node = new Node(data);
            node->_next = _tables[idx];
            _tables[idx] = node;
            ++_n;
            return true;
        }

        // Returns true when an element with the given key is stored.
        bool Find(const K& key)
        {
            Hash hs;
            KeyOfT kot;
            const size_t idx = hs(key) % _tables.size();
            for (Node* cur = _tables[idx]; cur; cur = cur->_next)
            {
                if (kot(cur->_data) == key)
                {
                    return true;
                }
            }
            return false;
        }

        // Removes the element with the given key.
        // Returns true when an element was removed.
        bool Erase(const K& key)
        {
            Hash hs;
            KeyOfT kot;
            // The hash must be reduced modulo the bucket count before indexing.
            const size_t idx = hs(key) % _tables.size();

            Node* prev = nullptr;
            for (Node* cur = _tables[idx]; cur; prev = cur, cur = cur->_next)
            {
                if (kot(cur->_data) == key)
                {
                    // Unlink only this node; the rest of the chain stays
                    // reachable whether it was the head or not.
                    if (prev)
                    {
                        prev->_next = cur->_next;
                    }
                    else
                    {
                        _tables[idx] = cur->_next;
                    }
                    delete cur;
                    --_n;
                    return true;
                }
            }
            return false;
        }

        // Prints every stored element, bucket by bucket, then a newline.
        void Print()
        {
            for (auto& head : _tables)
            {
                for (Node* cur = head; cur; cur = cur->_next)
                {
                    std::cout << cur->_data << " ";
                }
            }
            std::cout << std::endl;
        }

    private:
        // Deletes all nodes and leaves every bucket empty.
        void Clear()
        {
            for (size_t i = 0; i < _tables.size(); ++i)
            {
                Node* cur = _tables[i];
                while (cur)
                {
                    Node* next = cur->_next;
                    delete cur;
                    cur = next;
                }
                _tables[i] = nullptr;
            }
            _n = 0;
        }

        std::vector<Node*> _tables;  // bucket heads
        size_t _n = 0;               // number of stored elements
    };

二、封装unordered_set和unordered_map

unordered顾名思义就是没有顺序，因为底层并不是红黑树，而是哈希表，因此，如果数据并不要求有序输出，只要求高效查找，那么就可以使用unordered_set或unordered_map。

2.1 哈希表的迭代器

哈希表的迭代器在实现++功能时，需要用到哈希表，因此需要在迭代器中声明哈希表这个模板类，同时，又需要访问哈希表内的元素，因此，又需要在哈希表内声明迭代器是友元类。最后，还需要对find、erase等函数的返回值进行修改，以便于封装map和set，因此修改基于哈希桶的哈希表如下：

	// Node of the singly linked chain that hangs off one bucket.
	template<class T>
	struct HashBucketNode
	{
	    T _data;
	    HashBucketNode<T>* _next;

	    HashBucketNode(const T& data)
	        : _data(data), _next(nullptr)
	    {}
	};

	// Forward declaration: the iterator walks the table's bucket array.
	template<class K, class V, class KeyOfValue, class HF>
	class HashTable;

	// Forward iterator over a HashTable. Chains are singly linked, so there is
	// no operator--.
	//   ref / ptr - reference and pointer types returned by * and ->
	//               (T&, T* for iterator; const T&, const T* for const_iterator)
	template <class K, class V, class ref, class ptr, class KeyOfValue, class HF>
	struct HBIterator
	{
	    typedef HashTable<K, V, KeyOfValue, HF> Table;
	    typedef HashBucketNode<V>* PNode;
	    typedef HBIterator<K, V, ref, ptr, KeyOfValue, HF> Self;

	    // A single constructor taking a pointer-to-const table. A non-const
	    // table pointer converts to it implicitly.
	    HBIterator(PNode pNode = nullptr, const Table* pHt = nullptr)
	        : _pNode(pNode), _pHt(pHt)
	    {}

	    // Converting constructor between iterator and const_iterator of the same
	    // table type. Its template parameters use their own names so they do not
	    // shadow the class template's parameters.
	    template<class OtherRef, class OtherPtr>
	    HBIterator(const HBIterator<K, V, OtherRef, OtherPtr, KeyOfValue, HF>& it)
	        : _pNode(it._pNode), _pHt(it._pHt)
	    {}

	    // Advances to the next element; becomes the end iterator (null node)
	    // after the last one. Incrementing an end iterator is a no-op.
	    Self& operator++()
	    {
	        if (_pNode == nullptr)
	        {
	            return *this;
	        }
	        if (_pNode->_next)
	        {
	            _pNode = _pNode->_next;
	            return *this;
	        }

	        // End of this chain: continue with the first non-empty bucket after
	        // the one the current element hashes to.
	        size_t idx = HF()(KeyOfValue()(_pNode->_data)) % _pHt->_tables.size() + 1;
	        _pNode = nullptr;
	        for (; idx < _pHt->_tables.size(); ++idx)
	        {
	            if (_pHt->_tables[idx])
	            {
	                _pNode = _pHt->_tables[idx];
	                break;
	            }
	        }
	        return *this;
	    }

	    Self operator++(int)
	    {
	        Self previous = *this;
	        ++(*this);
	        return previous;
	    }

	    ref operator*() const { return _pNode->_data; }
	    ptr operator->() const { return &(_pNode->_data); }

	    bool operator==(const Self& it) const
	    {
	        return (it._pNode == _pNode) && (it._pHt == _pHt);
	    }
	    bool operator!=(const Self& it) const
	    {
	        return !(*this == it);
	    }

	    PNode _pNode;        // node the iterator refers to; nullptr for end
	    const Table* _pHt;   // owning table, needed to find the next non-empty bucket
	};

	// Default hash functor: converts the key to size_t with a plain cast.
	template<class K>
	struct HashFunc
	{
	public:
	    size_t operator()(const K& key) const
	    {
	        return (size_t)key;
	    }
	};

	// String keys: polynomial hash with multiplier 31.
	template<>
	struct HashFunc<std::string>
	{
	public:
	    size_t operator()(const std::string& key) const
	    {
	        size_t hash = 0;
	        for (const char ch : key)
	        {
	            hash *= 31;
	            // unsigned char keeps the result independent of char signedness
	            hash += static_cast<unsigned char>(ch);
	        }
	        return hash;
	    }
	};

	// Hash table with separate chaining and forward iterators.
	//   K      - key type
	//   T      - stored element type (a key/value pair, or just K)
	//   KeyOfT - functor extracting the key from a T
	//   Hash   - functor converting a key to size_t (the bucket is hash % bucket count)
	template<class K, class T, class KeyOfT, class Hash = HashFunc<K>>
	class HashTable
	{
	    // The iterator reads _tables. Parameter names differ from the class's
	    // own so they do not shadow them.
	    template <class K2, class V2, class Ref2, class Ptr2, class KeyOfValue2, class HF2>
	    friend struct HBIterator;

	    typedef HashBucketNode<T> Node;

	public:
	    typedef HBIterator<K, T, T&, T*, KeyOfT, Hash> iterator;
	    typedef HBIterator<K, T, const T&, const T*, KeyOfT, Hash> const_iterator;

	    // Starts with 10 empty buckets.
	    HashTable()
	    {
	        _tables.resize(10, nullptr);
	    }

	    // Deep copy. Without it the implicit copy would share nodes between two
	    // tables and both destructors would delete them.
	    HashTable(const HashTable& other)
	        : _tables(other._tables.size(), nullptr)
	    {
	        try
	        {
	            for (size_t i = 0; i < other._tables.size(); ++i)
	            {
	                Node** tail = &_tables[i];
	                for (const Node* src = other._tables[i]; src; src = src->_next)
	                {
	                    *tail = new Node(src->_data);
	                    tail = &(*tail)->_next;
	                    ++_n;
	                }
	            }
	        }
	        catch (...)
	        {
	            // The destructor does not run for a half-built object.
	            Clear();
	            throw;
	        }
	    }

	    // Copy-and-swap assignment: `other` is already a private deep copy.
	    HashTable& operator=(HashTable other)
	    {
	        _tables.swap(other._tables);
	        const size_t count = _n;
	        _n = other._n;
	        other._n = count;
	        return *this;
	    }

	    // Frees every node of every bucket.
	    ~HashTable()
	    {
	        Clear();
	    }

	    // First element in bucket order, or end() when the table is empty.
	    iterator begin()
	    {
	        for (size_t i = 0; i < _tables.size(); ++i)
	        {
	            if (_tables[i]) return iterator(_tables[i], this);
	        }
	        return iterator(nullptr, this);
	    }

	    iterator end()
	    {
	        return iterator(nullptr, this);
	    }

	    const_iterator begin() const
	    {
	        for (size_t i = 0; i < _tables.size(); ++i)
	        {
	            if (_tables[i]) return const_iterator(_tables[i], this);
	        }
	        return const_iterator(nullptr, this);
	    }

	    const_iterator end() const
	    {
	        return const_iterator(nullptr, this);
	    }

	    size_t size() const { return _n; }
	    bool empty() const { return _n == 0; }

	    // Number of stored elements whose key equals `key`.
	    size_t Count(const K& key)
	    {
	        Hash hs;
	        KeyOfT kot;
	        const size_t idx = hs(key) % _tables.size();
	        size_t matches = 0;
	        for (Node* cur = _tables[idx]; cur; cur = cur->_next)
	        {
	            if (kot(cur->_data) == key)
	            {
	                ++matches;
	            }
	        }
	        return matches;
	    }

	    // Number of buckets.
	    size_t BucketCount()
	    {
	        return _tables.size();
	    }

	    // Number of elements in the bucket that `key` hashes to.
	    size_t BucketSize(const K& key)
	    {
	        Hash hs;
	        const size_t idx = hs(key) % _tables.size();
	        size_t length = 0;
	        for (Node* cur = _tables[idx]; cur; cur = cur->_next)
	        {
	            ++length;
	        }
	        return length;
	    }

	    // Inserts data. When the key is already present nothing is inserted and
	    // the result is {iterator to the existing element, false}. Otherwise the
	    // result is {iterator to the new element, true}.
	    std::pair<iterator, bool> Insert(const T& data)
	    {
	        KeyOfT kot;
	        Hash hs;
	        iterator existing = Find(kot(data));
	        if (existing._pNode != nullptr)
	        {
	            return std::make_pair(existing, false);
	        }

	        // Load factor 1: double the bucket array and relink the existing
	        // nodes instead of reallocating them.
	        if (_n == _tables.size())
	        {
	            std::vector<Node*> grown(2 * _tables.size(), nullptr);
	            for (size_t i = 0; i < _tables.size(); ++i)
	            {
	                Node* cur = _tables[i];
	                while (cur)
	                {
	                    Node* next = cur->_next;
	                    const size_t target = hs(kot(cur->_data)) % grown.size();
	                    cur->_next = grown[target];
	                    grown[target] = cur;
	                    cur = next;
	                }
	                _tables[i] = nullptr;
	            }
	            _tables.swap(grown);
	        }

	        // Push the new node at the head of its bucket.
	        const size_t idx = hs(kot(data)) % _tables.size();
	        Node* node = new Node(data);
	        node->_next = _tables[idx];
	        _tables[idx] = node;
	        ++_n;
	        return std::make_pair(iterator(node, this), true);
	    }

	    // Returns an iterator to the element with the given key, or end().
	    iterator Find(const K& key)
	    {
	        Hash hs;
	        KeyOfT kot;
	        const size_t idx = hs(key) % _tables.size();
	        for (Node* cur = _tables[idx]; cur; cur = cur->_next)
	        {
	            if (kot(cur->_data) == key)
	            {
	                return iterator(cur, this);
	            }
	        }
	        return iterator(nullptr, this);
	    }

	    // Removes the element whose key equals the key of *position. Returns an
	    // iterator to the element following the removed one in iteration order,
	    // or end() when nothing follows, nothing matched, or position is end().
	    iterator Erase(iterator position)
	    {
	        if (position._pNode == nullptr)
	        {
	            return end();
	        }

	        Hash hs;
	        KeyOfT kot;
	        // Copy the key: the node it lives in is about to be deleted.
	        const K key = kot(position._pNode->_data);
	        // The hash must be reduced modulo the bucket count before indexing.
	        size_t idx = hs(key) % _tables.size();

	        Node* prev = nullptr;
	        Node* cur = _tables[idx];
	        while (cur && !(kot(cur->_data) == key))
	        {
	            prev = cur;
	            cur = cur->_next;
	        }
	        if (cur == nullptr)
	        {
	            return end();
	        }

	        // Unlink only this node so the rest of the chain stays reachable.
	        Node* next = cur->_next;
	        if (prev)
	        {
	            prev->_next = next;
	        }
	        else
	        {
	            _tables[idx] = next;
	        }
	        delete cur;
	        --_n;

	        if (next)
	        {
	            return iterator(next, this);
	        }
	        for (++idx; idx < _tables.size(); ++idx)
	        {
	            if (_tables[idx])
	            {
	                return iterator(_tables[idx], this);
	            }
	        }
	        return end();
	    }

	    // Prints every stored element, bucket by bucket, then a newline.
	    void Print()
	    {
	        for (auto& head : _tables)
	        {
	            for (Node* cur = head; cur; cur = cur->_next)
	            {
	                std::cout << cur->_data << " ";
	            }
	        }
	        std::cout << std::endl;
	    }

	private:
	    // Deletes all nodes and leaves every bucket empty.
	    void Clear()
	    {
	        for (size_t i = 0; i < _tables.size(); ++i)
	        {
	            Node* cur = _tables[i];
	            while (cur)
	            {
	                Node* next = cur->_next;
	                delete cur;
	                cur = next;
	            }
	            _tables[i] = nullptr;
	        }
	        _n = 0;
	    }

	    std::vector<Node*> _tables;  // bucket heads
	    size_t _n = 0;               // number of stored elements
	};

需要注意的是，在使用的过程中可能会遇到const迭代器和普通迭代器之间需要相互转换的情况，这里使用了两种解决办法：

直接将一个迭代器中的数据取出来，利用构造函数构造出另一个迭代器
使用模板类构造函数，如下：

// Converting constructor of HBIterator: builds this iterator from another
// HBIterator instantiation that differs only in its reference/pointer types
// (iterator <-> const_iterator). K, V, KeyOfValue and HF are the enclosing
// class template's own parameters; the member template introduces new names
// only for the two types that vary, because redeclaring an enclosing template
// parameter name is ill-formed.
template<class OtherRef, class OtherPtr>
HBIterator(const HBIterator<K, V, OtherRef, OtherPtr, KeyOfValue, HF>& it)
    : _pNode(it._pNode), _pHt(it._pHt)
{
}

在要使用的模板类前声明模板参数，然后就能利用该模板类进行构造。

经过以上两步，就能对set和map进行封装，但是在实际编写的过程中，封装与对哈希函数的修改是同步进行的，这样在调试的过程中更容易发现问题。

2.2 unordered_set

// unordered_set中存储的是K类型，HF哈希函数类型
// unordered_set在实现时，只需将hashbucket中的接口重新封装即可
template<class K, class HF = hash_bucket::HashFunc<K>>
class unordered_set
{// 通过key获取value的操作struct KeyOfValue{const K& operator()(const K& data){return data;}};typedef hash_bucket::HashTable<K, K, KeyOfValue, HF> HT;
public:typename typedef HT::const_iterator iterator;unordered_set() : _ht(){}iterator begin() const{ return _ht.begin(); }iterator end() const{ return _ht.end(); }// capacitysize_t size()const { return _ht.size(); }bool empty() const{ return _ht.empty(); }///// lookupiterator find(const K& key) { return _ht.Find(key); }size_t count(const K& key) { return _ht.Count(key); }/// modifypair<iterator, bool> insert(const K& value){auto ret = _ht.Insert(value);return make_pair(iterator(ret.first._pNode, ret.first._pHt), ret.second);}iterator erase(iterator position){auto ret = _ht.Erase(position);return iterator(ret._pNode,ret._pHt);}// bucketsize_t bucket_count() { return _ht.BucketCount(); }size_t bucket_size(const K& key) { return _ht.BucketSize(key); }
private:HT _ht;
};

这里keyofvalue与之前使用红黑树进行封装的原理是一样的，避免因为数据类型的不同影响对关键字的读取。

2.3 unordered_map

// unordered_map中存储的是pair<K, V>的键值对，K为key的类型，V为value的类型，HF哈希函数类型
// unordered_map在实现时，只需将hashtable中的接口重新封装即可
template<class K, class V, class HF = hash_bucket::HashFunc<K>>
class unordered_map
{// 通过key获取value的操作struct KeyOfValue{const K& operator()(const pair<K, V>& data){return data.first;}};typedef hash_bucket::HashTable<K, pair<const K, V>, KeyOfValue, HF> HT;
public:typename typedef HT::iterator iterator;typename typedef HT::const_iterator const_iterator;unordered_map() : _ht(){}iterator begin() { return _ht.begin(); }iterator end() { return _ht.end(); }// capacitysize_t size()const { return _ht.size(); }bool empty()const { return _ht.empty(); }///// AcessV& operator[](const K& key){pair<iterator, bool> ret = _ht.Insert(pair<K, V>(key, V()));return ret.first->second;}const V& operator[](const K& key)const {pair<iterator, bool> ret = _ht.Insert(pair<K, V>(key, V()));return ret.first->second;}//// lookupiterator find(const K& key) { return _ht.Find(key); }size_t count(const K& key) { return _ht.Count(key); }/// modifypair<iterator, bool> insert(const pair<K, V>& value){return _ht.Insert(value);}iterator erase(iterator position){return _ht.Erase(position);}// bucketsize_t bucket_count() { return _ht.BucketCount(); }size_t bucket_size(const K& key) { return _ht.BucketSize(key); }
private:HT _ht;
};

查看全文

http://www.dtcms.com/a/242534.html