当前位置：首页 > news >正文

c++进阶之----哈希（桶）

news 来源：原创 2025/6/24 4:15:50

本篇主要讲解哈希桶的实现，线性探测和开放寻址法在下一篇博客中讲解

1.概念

哈希（Hash）是一种将任意长度的输入数据映射到固定长度的输出数据的方法。哈希函数是哈希的核心，它负责将输入数据转换为哈希值。哈希值通常是一个整数，用于快速查找和比较数据。

哈希函数：将输入数据转换为固定长度的输出数据（哈希值）的函数。
哈希值：哈希函数的输出结果。
哈希表：一种基于哈希的数据结构，用于快速查找、插入和删除数据。

2. 负载因子

假设哈希表中已经映射存储了N个值，哈希表的大小为M，那么负载因子 $\alpha = \frac{N}{M}$。负载因子在有些地方也翻译为载荷因子/装载因子等。

2.1 负载因子的作用

影响冲突概率：负载因子越高，哈希表中的元素越密集，冲突的概率也越高。
影响查找效率：负载因子越高，查找元素时需要遍历的链表或开放寻址的次数可能越多，查找效率降低。

2.2 负载因子对哈希表性能的影响

负载因子过高：
- 冲突概率增加，查找效率下降。
- 可能需要动态调整桶的数量以降低负载因子。
负载因子过低：
- 桶的利用率不高，浪费内存空间。
- 查找效率可能较高，但内存使用效率低

3.实现

3.1 基本结构

和其他数据结构类似，我们还是先构建其结点，之后再由结点组成我们的哈希桶，这里我们用了链地址法，可以参考下图

像一条拉链一样，我们在构造它时需要以指针数组为底层结构，数组的每一个位置（桶）都存放一个指针作为“挂钩”，方便我们映射元素并进行存储。__stl_next_prime(0) 返回质数表的第一个数（53），相当于构造的时候就给出了一定的空间。但为什么要是质数呢？因为我们要减少哈希冲突，即减少映射在同一个位置上的数据，从而提高查找效率；而且当哈希表的大小为质数时，输入数据的步长（如某些模式化的输入）不容易与表的大小产生公因数，从而减少由公因数引起的冲突。

// Bucket-count schedule used when the table is created or grown:
// 28 ascending values from the prime table, each roughly double the previous one.
static const int __stl_num_primes = 28;
static const unsigned long __stl_prime_list[__stl_num_primes] =
{
  53,         97,         193,       389,       769,
  1543,       3079,       6151,      12289,     24593,
  49157,      98317,      196613,    393241,    786433,
  1572869,    3145739,    6291469,   12582917,  25165843,
  50331653,   100663319,  201326611, 402653189, 805306457,
  1610612741, 3221225473, 4294967291
};

/// Returns the smallest entry of __stl_prime_list that is >= n.
/// If n is larger than every entry, the last (largest) entry is returned.
inline unsigned long __stl_next_prime(unsigned long n)
{
	const unsigned long* const first = __stl_prime_list;
	const unsigned long* const last = first + __stl_num_primes;
	// Qualified call: pointers to built-in types trigger no ADL, so the
	// unqualified name only resolves when a using-directive is in effect.
	const unsigned long* const pos = std::lower_bound(first, last, n);
	return pos == last ? *(last - 1) : *pos;
}
/// Default hash functor: converts the key itself to size_t.
/// Usable for any key type that static_cast can convert to size_t
/// (integers, enums, ...); other key types need a specialization.
template<class  k>
struct HashFunc
{
	// const so that the functor can also be invoked through a const object.
	size_t operator()(const k& key) const
	{
		return static_cast<size_t>(key);
	}
};
namespace hash_bucket
{
	/// One link of a bucket chain: a key/value pair plus a pointer to the
	/// following node in the same bucket (nullptr when it is the last one).
	template<class k, class v>
	struct HashNode
	{
		pair<k, v> _kv;
		HashNode* _next = nullptr;

		/// Copies `kv` into the node; the node starts out unlinked.
		HashNode(const pair<k, v>& kv)
			: _kv(kv)
		{}
	};

	/// Separate-chaining hash table skeleton (construction and destruction only).
	///
	/// Every slot of `_tables` is the head pointer of a singly linked list of
	/// heap-allocated nodes; `_n` counts the stored pairs.
	template<class k, class v,class hash=HashFunc<k> >
	class HashTable
	{
		typedef HashNode<k, v> Node;
	public:
		/// Creates an empty table with `size` buckets (default: the first
		/// entry of the prime table). A request for zero buckets is raised
		/// to one, because lookups reduce the hash modulo the bucket count.
		HashTable(size_t size = __stl_next_prime(0))
			:_tables(size == 0 ? 1 : size, nullptr)
		{}

		// The table owns its nodes through raw pointers, so a member-wise
		// copy would make two tables delete the same nodes. Copying is disabled.
		HashTable(const HashTable&) = delete;
		HashTable& operator=(const HashTable&) = delete;

		/// Frees every node of every bucket.
		~HashTable()
		{
			for (size_t i = 0; i < _tables.size(); i++)
			{
				Node* cur = _tables[i];
				while (cur)
				{
					// Save the successor before the current node is destroyed.
					Node* next = cur->_next;
					delete cur;
					cur = next;
				}
				// The chain is gone; leave no dangling head pointer behind.
				_tables[i] = nullptr;
			}
		}

	private:
		std::vector<Node*> _tables;  // bucket array: head pointer of each chain
		size_t _n = 0;               // number of stored key/value pairs
	};

}

3.2 查找函数

思路很简单，就是先算出要查找元素的映射值，之后让cur定位到对应的桶。以上例来说，比如我要找30，算出来它映射在8的位置，那就让cur先来到8这里，相当于是“8”这条链表的头结点，之后像链表一样查找就OK了。

参考代码：

		/// Looks up `key` in the bucket it hashes to.
		/// Returns the node holding the key, or nullptr when the key is absent.
		Node* find(const k& key)
		{
			hash hasher;
			const size_t bucket = hasher(key) % _tables.size();
			// Walk the chain hanging off the selected bucket.
			for (Node* node = _tables[bucket]; node != nullptr; node = node->_next)
			{
				if (node->_kv.first == key)
				{
					return node;
				}
			}
			return nullptr;
		}

3.3 删除函数

这个思路也和删除链表结点一样，还是先算映射，比如删30，映射到8这个位置，之后就在“8”这条链表里面去寻找30就好。这里值得注意的是，如果我们要删的是96（链表的第一个结点）的话，删了之后“8”这个桶就和链表“脱钩”了，所以我们得让“8”这个位置（头指针）指向cur->_next，其余情况参考链表删除代码就好了。

bool erase(const k& key)
{
	//跟删除链表的方法一样

	hash hs;
	size_t hashi = hs(key) % _tables.size();
	Node* prev = nullptr;
	Node* cur = _tables[hashi];
	while (cur)
	{
		if (cur->_kv.first == key)
		{
			if (prev == nullptr)
			{
				_tables[hashi] = cur->_next;
			}
			else
			{
				prev->_next = cur->_next;
			}
			--_n;
			delete cur;
			return true;
		}
		prev = cur;
		cur = cur->_next;
	}
	return false;
}

3.4 插入函数

插入的逻辑也比较好理解：先判断键是否已存在，再判断哈希表满没满（负载因子是否到1），满了就扩容，扩容方法及注释在代码里面，这里不再赘述；之后就是映射、头插，跟链表的插入差不多。

bool insert(const pair<k, v>& kv)
{
	if (find(kv.first))
	{
		return false;        //有相同的键了，不能再插入了
	}
	hash hs;
	// 负载因子到1，再扩容
	if (_n == _tables.size())        //满了要扩容
	{
		 也可以，但是扩容新开辟节点，释放旧节点，有点浪费
		//HashTable<k, v> newHT(__stl_next_prime(_tables.size() + 1));     
        //这里不同于之前链表之类的二倍扩容，我们通常采用质数来扩容
		//for (size_t i = 0; i < _tables.size(); i++)
		//{
		//	Node* cur = _tables[i];
		//	while (cur)
		//	{
		//		newHT.insert(kv.first);
		//		cur = cur->_next;
		//	}
		//}
		//_tables.swap(newHT._tables);    //我们本意是想给this的哈希表扩容，只不过是借用另一 
                                            个哈希表来转移数据罢了，搞好了就要过河拆桥了
		vector<Node*> newtables(__stl_next_prime(_tables.size() + 1), nullptr);
		for (size_t i = 0;i < _tables.size(); i++)
		{
			Node* cur = _tables[i];      //可以认为tables[i]就是图中横着的那些，cur的使命就 
                                           是把可以映射到这里的位置的数一个一个的挂上去
			while (cur)
			{
				// 旧表的节点挪动下来
				// 插入到映射的新表位置
				Node* next = cur->_next;
				size_t hashi = hs(cur->_kv.first) % newtables.size();
				cur->_next = newtables[hashi];
				newtables[hashi] = cur;
				cur = next;
			}
			_tables[i] = nullptr;
		}
		_tables.swap(newtables);
	}
	size_t hashi = hs(kv.first) % _tables.size();
	Node* newnode = new Node(kv);

	//头插
	newnode->_next = _tables[hashi];
	_tables[hashi] = newnode;
	++_n;
	return true;
}

4.代码汇总

这是.h文件

#pragma once
#include <algorithm>
#include <string>
#include <utility>
#include <vector>
using namespace std;
//扩容用
// Bucket-count schedule used when the table is created or grown:
// 28 ascending values from the prime table, each roughly double the previous one.
static const int __stl_num_primes = 28;
static const unsigned long __stl_prime_list[__stl_num_primes] =
{
  53,         97,         193,       389,       769,
  1543,       3079,       6151,      12289,     24593,
  49157,      98317,      196613,    393241,    786433,
  1572869,    3145739,    6291469,   12582917,  25165843,
  50331653,   100663319,  201326611, 402653189, 805306457,
  1610612741, 3221225473, 4294967291
};

/// Returns the smallest entry of __stl_prime_list that is >= n.
/// If n is larger than every entry, the last (largest) entry is returned.
inline unsigned long __stl_next_prime(unsigned long n)
{
	const unsigned long* const first = __stl_prime_list;
	const unsigned long* const last = first + __stl_num_primes;
	// Qualified call: pointers to built-in types trigger no ADL, so the
	// unqualified name only resolves when a using-directive is in effect.
	const unsigned long* const pos = std::lower_bound(first, last, n);
	return pos == last ? *(last - 1) : *pos;
}


/// Default hash functor: converts the key itself to size_t.
/// Usable for any key type that static_cast can convert to size_t
/// (integers, enums, ...); other key types need a specialization.
template<class  k>
struct HashFunc
{
	// const so that the functor can also be invoked through a const object.
	size_t operator()(const k& key) const
	{
		return static_cast<size_t>(key);
	}
};

/// Specialization for strings: polynomial hash with multiplier 131,
/// computed as hash = hash * 131 + byte for every byte of the key.
/// The multiplication wraps around modulo the range of size_t.
template<>
struct HashFunc<std::string> {
	size_t operator()(const std::string& key) const
	{
		size_t hashi = 0;
		for (const char ch : key)
		{
			hashi *= 131;
			// Go through unsigned char so bytes >= 0x80 contribute 128..255
			// whether plain char is signed or unsigned on the platform.
			hashi += static_cast<unsigned char>(ch);
		}
		return hashi;
	}
};
namespace hash_bucket
{
	/// One link of a bucket chain: a key/value pair plus a pointer to the
	/// following node in the same bucket (nullptr when it is the last one).
	template<class k, class v>
	struct HashNode
	{
		pair<k, v> _kv;
		HashNode* _next = nullptr;

		/// Copies `kv` into the node; the node starts out unlinked.
		HashNode(const pair<k, v>& kv)
			: _kv(kv)
		{}
	};

	template<class k, class v,class hash=HashFunc<k> >
	class HashTable
	{
		typedef HashNode<k, v> Node;
	public:
		HashTable(size_t size = __stl_next_prime(0))
			:_tables(size, nullptr)
		{}

		~HashTable()
		{
			for (size_t i = 0; i < _tables.size(); i++)
			{
				Node* cur = _tables[i];
				while (cur)
				{
					Node* next = cur->_next;
					delete cur;
					cur = next;
				}
				//表里面删完了记得把这个表置为空
				_tables[i] = nullptr;
			}
		}

		bool insert(const pair<k, v>& kv)
		{
			if (find(kv.first))
			{
				return false;        //有相同的键了，不能再插入了
			}
			hash hs;
			// 负载因子到1，再扩容
			if (_n == _tables.size())        //满了要扩容
			{
				 也可以，但是扩容新开辟节点，释放旧节点，有点浪费
				//HashTable<k, v> newHT(__stl_next_prime(_tables.size() + 1));     //这里不同于之前链表之类的二倍扩容，我们通常采用质数来扩容
				//for (size_t i = 0; i < _tables.size(); i++)
				//{
				//	Node* cur = _tables[i];
				//	while (cur)
				//	{
				//		newHT.insert(kv.first);
				//		cur = cur->_next;
				//	}
				//}
				//_tables.swap(newHT._tables);    //我们本意是想给this的哈希表扩容，只不过是借用另一个哈希表来转移数据罢了，搞好了就要过河拆桥了
				vector<Node*> newtables(__stl_next_prime(_tables.size() + 1), nullptr);
				for (size_t i = 0;i < _tables.size(); i++)
				{
					Node* cur = _tables[i];      //可以认为tables[i]就是图中横着的那些，cur的使命就是把可以映射到这里的位置的数一个一个的挂上去
					while (cur)
					{
						// 旧表的节点挪动下来
						// 插入到映射的新表位置
						Node* next = cur->_next;
						size_t hashi = hs(cur->_kv.first) % newtables.size();
						cur->_next = newtables[hashi];
						newtables[hashi] = cur;
						cur = next;
					}
					_tables[i] = nullptr;
				}
				_tables.swap(newtables);
			}
			size_t hashi = hs(kv.first) % _tables.size();
			Node* newnode = new Node(kv);

			//头插
			newnode->_next = _tables[hashi];
			_tables[hashi] = newnode;
			++_n;
			return true;
		}

		Node* find(const k& key)
		{
			hash hs;
			size_t hashi = hs(key) % _tables.size();
			Node* cur = _tables[hashi];
			while (cur)
			{
				if (cur->_kv.first == key)
				{
					return cur;   //找到了
				}
				cur = cur->_next;
			}
			return nullptr;
		}

		bool erase(const k& key)
		{
			//跟删除链表的方法一样

			hash hs;
			size_t hashi = hs(key) % _tables.size();
			Node* prev = nullptr;
			Node* cur = _tables[hashi];
			while (cur)
			{
				if (cur->_kv.first == key)
				{
					if (prev == nullptr)
					{
						_tables[hashi] = cur->_next;
					}
					else
					{
						prev->_next = cur->_next;
					}
					--_n;
					delete cur;
					return true;
				}
				prev = cur;
				cur = cur->_next;
			}
			return false;
		}

	private:
		vector<Node*> _tables;  //指针数组
		size_t _n = 0;
	};

}

这是.cpp测试文件

#include"hashtable.h"
#include<string>
#include<iostream>
using namespace std;
namespace hash_bucket
{
	// Smoke test without assertions: fills an int table, erases three of the
	// inserted keys, then inserts two entries into a string-keyed table.
	void test1()
	{
		const int keys[] = { 19, 30, 5, 36, 13, 20, 21, 12, 58, 111 };
		HashTable<int, int> ht;
		for (const int key : keys)
		{
			ht.insert(pair<int, int>(key, key));
		}

		const int removed[] = { 5, 58, 111 };
		for (const int key : removed)
		{
			ht.erase(key);
		}

		HashTable<string, string> dict;
		dict.insert(pair<string, string>("sort", "排序"));
		dict.insert(pair<string, string>("string", "字符串"));
	}
}

int main()
{
	hash_bucket::test1();

    HashFunc<string> hf;
	cout << hf("bcad") << endl;
	cout << hf("aadd") << endl;
	cout << hf("abcd") << endl;
	return 0;
}