当前位置：首页 > news >正文

【C++进阶】哈希

news 2025/11/14 16:32:39

【C++进阶】哈希

1. unordered系列关联式容器

1.1 为什么需要unordered容器

在C++98中，STL提供了基于红黑树的关联式容器（map、set等），查询效率为O(log₂N)。但当数据量非常大时，这种对数级别的时间复杂度仍然不够理想。C++11引入了unordered系列容器，通过哈希表实现，平均情况下查询效率达到O(1)（最坏情况下，大量哈希冲突会使其退化为O(N)）。

特性	树形容器	哈希容器
底层结构	红黑树	哈希表
查询效率	O(logN)	平均O(1)
元素顺序	有序	无序
内存使用	相对较少	相对较多

1.2 unordered_map详解

1.2.1 基本特性

// Declaration of the unordered_map class template (shown without its enclosing namespace).
//   Key   - type of the keys
//   T     - type of the mapped values
//   Hash  - function object type used to hash a Key; defaults to hash<Key>
//   Pred  - function object type used to compare two keys for equality; defaults to equal_to<Key>
//   Alloc - allocator type for the stored pair<const Key, T> elements
template <class Key, class T,class Hash = hash<Key>,class Pred = equal_to<Key>,class Alloc = allocator<pair<const Key, T>>>
class unordered_map;

核心特点：

存储键值对，支持快速查找
元素无序存储
平均时间复杂度为O(1)
支持operator[]操作

1.2.2 使用示例

#include <iostream>
#include <string>
#include <unordered_map>

// Demonstrates the basic std::unordered_map operations and prints each result
// to std::cout: insertion (operator[] and insert), element access, iteration,
// lookup with find(), and the bucket interface.
void TestUnorderedMap() {
    std::unordered_map<std::string, int> ageMap;

    // Insert elements: operator[] creates the entry on first use, insert() takes a pair.
    ageMap["Alice"] = 25;
    ageMap["Bob"] = 30;
    ageMap.insert({"Charlie", 28});

    // Access an element by key.
    std::cout << "Alice's age: " << ageMap["Alice"] << '\n';

    // Iterate over all elements (the visiting order is unspecified).
    for (const auto& pair : ageMap) {
        std::cout << pair.first << ": " << pair.second << '\n';
    }

    // Look up an element; find() returns end() when the key is absent.
    auto it = ageMap.find("Bob");
    if (it != ageMap.end()) {
        std::cout << "Found Bob, age: " << it->second << '\n';
    }

    // Bucket interface: number of buckets and current load factor.
    std::cout << "Bucket count: " << ageMap.bucket_count() << '\n';
    std::cout << "Load factor: " << ageMap.load_factor() << '\n';
}

1.3 unordered_set详解

#include <iostream>
#include <unordered_set>

// Demonstrates the basic std::unordered_set operations and prints the results
// to std::cout: insertion (duplicates are rejected), lookup and iteration.
void TestUnorderedSet() {
    std::unordered_set<int> numSet;

    // Insert elements.
    numSet.insert(1);
    numSet.insert(2);
    numSet.insert(3);
    numSet.insert(2);  // a duplicate element is not inserted again

    // Look up an element; find() returns end() when the value is absent.
    if (numSet.find(2) != numSet.end()) {
        std::cout << "2 exists in set" << '\n';
    }

    // Iterate over all elements (the visiting order is unspecified).
    for (int num : numSet) {
        std::cout << num << " ";
    }
    std::cout << '\n';
}

1.4 实际应用案例

1.4.1 统计元素出现次数

// Finds the element of A that occurs exactly A.size() / 2 times.
class Solution {
public:
    // Counts every value of A in a hash map, then returns a value whose count
    // equals A.size() / 2. Returns -1 when no value has that count (including
    // when A is empty).
    int repeatedNTimes(std::vector<int>& A) {
        std::unordered_map<int, int> countMap;

        // Count how often each element occurs.
        for (int num : A) {
            ++countMap[num];
        }

        // Find the element that occurs n times.
        const int n = static_cast<int>(A.size() / 2);
        for (const auto& pair : countMap) {
            if (pair.second == n) {
                return pair.first;
            }
        }
        return -1;
    }
};

1.4.2 求两个数组的交集

// Computes the intersection of two integer arrays.
class Solution {
public:
    // Returns every value that occurs in both nums1 and nums2, each exactly
    // once. The order of the returned values is unspecified.
    std::vector<int> intersection(std::vector<int>& nums1, std::vector<int>& nums2) {
        // Values of nums1 that have not been reported yet.
        std::unordered_set<int> pending(nums1.begin(), nums1.end());

        std::vector<int> result;
        for (int num : nums2) {
            // erase() returns 1 only the first time a shared value is seen,
            // which de-duplicates the output without a second set.
            if (pending.erase(num) != 0) {
                result.push_back(num);
            }
        }
        return result;
    }
};

2. 哈希表的底层原理

2.1 哈希概念

理想搜索：不经过比较，一次直接从表中得到要搜索的元素。

哈希方法：通过哈希函数使元素的存储位置与关键码之间建立映射关系。

// Simple hash function example (division-remainder method).
// Maps `key` to a slot index in [0, capacity). The key is converted to size_t
// before the remainder is taken. `capacity` must not be zero.
size_t HashFunc(int key, size_t capacity) {
    const size_t unsignedKey = static_cast<size_t>(key);
    const size_t slot = unsignedKey % capacity;
    return slot;
}

2.2 哈希冲突

当不同关键字通过相同哈希函数计算出相同的哈希地址时，称为哈希冲突。

// 示例：哈希表容量为10
Hash(4) = 4 % 10 = 4
Hash(14) = 14 % 10 = 4  // 冲突！
Hash(24) = 24 % 10 = 4  // 冲突！

2.3 常见哈希函数

2.3.1 直接定址法

Hash(Key) = A * Key + B

适用场景：关键字分布连续且范围小

2.3.2 除留余数法

Hash(Key) = Key % P  // P通常为质数

适用场景：关键字范围较大或分布未知时的通用选择，是最常用的哈希函数

2.3.3 字符串哈希函数

// BKDR string hash.
// Folds the characters of `str` into one value: hash = hash * seed + character.
// Returns 0 for an empty string. Unsigned overflow wraps around, which is
// intended for a hash value.
size_t BKDRHash(const std::string& str) {
    size_t hash = 0;
    const size_t seed = 131;  // usual choices: 31, 131, 1313, 13131, 131313, ...
    for (char c : str) {
        // Go through unsigned char so that bytes >= 0x80 are not sign-extended;
        // this keeps the result independent of whether plain char is signed.
        hash = hash * seed + static_cast<unsigned char>(c);
    }
    return hash;
}

3. 哈希冲突解决方案

3.1 闭散列（开放定址法）

3.1.1 线性探测

当发生冲突时，依次向后查找空位置。

// State of one slot in the closed-hashing table.
//   EMPTY  - never used
//   EXIST  - holds a live element
//   DELETE - held an element that was removed (tombstone)
enum State { EMPTY, EXIST, DELETE };

// One slot of the table: the stored key/value pair plus the slot state.
template<class K, class V>
struct HashElem {
    std::pair<K, V> _val;
    State _state;
};

// Closed hashing (open addressing) hash table that resolves collisions with
// linear probing. K must be an integral type, because the slot index is
// computed as key % table size.
template<class K, class V>
class HashTable {
public:
    // Creates a table with `capacity` slots. A capacity of 0 is raised to 1 so
    // that the load-factor and hash computations never divide by zero.
    HashTable(size_t capacity = 10) : _size(0) {
        _table.resize(capacity == 0 ? 1 : capacity);
        for (auto& elem : _table) {
            elem._state = EMPTY;
        }
    }

    // Inserts `val`. Returns false when the key is already present or when no
    // free slot could be found; returns true after a successful insertion.
    bool Insert(const std::pair<K, V>& val) {
        // Grow once the load factor reaches 0.7.
        if (_size * 10 / _table.size() >= 7) {
            _CheckCapacity();
        }

        const size_t capacity = _table.size();
        size_t hashAddr = _HashFunc(val.first);
        size_t target = capacity;  // `capacity` means "no usable slot found yet"

        // Linear probing. The scan must run on past DELETE slots: the key may
        // still live further down the probe sequence. The first DELETE slot is
        // remembered so it can be reused.
        for (size_t probes = 0; probes < capacity; ++probes) {
            const HashElem<K, V>& slot = _table[hashAddr];
            if (slot._state == EMPTY) {
                if (target == capacity) {
                    target = hashAddr;
                }
                break;  // an EMPTY slot ends the probe sequence
            }
            if (slot._state == EXIST) {
                if (slot._val.first == val.first) {
                    return false;  // element already exists
                }
            } else if (target == capacity) {
                target = hashAddr;  // first tombstone on the way
            }
            ++hashAddr;
            if (hashAddr == capacity) {
                hashAddr = 0;
            }
        }

        if (target == capacity) {
            return false;  // table is full
        }

        // Store the element.
        _table[target]._val = val;
        _table[target]._state = EXIST;
        _size++;
        return true;
    }

private:
    // Doubles the number of slots and re-inserts every live element, because
    // the slot index depends on the table size.
    void _CheckCapacity() {
        HashTable<K, V> grown(_table.size() * 2);
        for (const auto& elem : _table) {
            if (elem._state == EXIST) {
                grown.Insert(elem._val);
            }
        }
        _table.swap(grown._table);
        _size = grown._size;
    }

    // Division-remainder hash: maps a key to a slot index.
    size_t _HashFunc(const K& key) const {
        return key % _table.size();
    }

    std::vector<HashElem<K, V>> _table;
    size_t _size;  // number of EXIST slots
};

3.1.2 二次探测

避免线性探测的数据堆积问题：

// 查找下一个位置的公式
H_i = (H_0 + i²) % m
// 或
H_i = (H_0 - i²) % m

3.2 开散列（链地址法）

3.2.1 基本概念

将哈希值相同的元素放在同一个桶中，用链表连接。

// Node of a singly linked bucket chain.
template<class V>
struct HashNode {
    HashNode(const V& data) : _data(data), _next(nullptr) {}

    V _data;
    HashNode<V>* _next;
};

// Open hashing (separate chaining) hash set: elements with the same bucket
// index are kept in one singly linked list. V must be an integral type,
// because the bucket index is computed as data % bucket count.
template<class V>
class HashBucket {
public:
    // Creates a table whose bucket count is the smallest prime >= capacity.
    HashBucket(size_t capacity = 10) : _size(0) {
        _table.resize(_GetNextPrime(capacity), nullptr);
    }

    // Frees every node; the table owns the nodes it allocated in Insert().
    ~HashBucket() {
        for (HashNode<V>* head : _table) {
            while (head) {
                HashNode<V>* next = head->_next;
                delete head;
                head = next;
            }
        }
    }

    // Copying would make two tables own the same nodes, so it is disabled.
    HashBucket(const HashBucket&) = delete;
    HashBucket& operator=(const HashBucket&) = delete;

    // Inserts `data`. Returns false when an equal element is already stored,
    // true after a successful insertion.
    bool Insert(const V& data) {
        // Grow once there are as many elements as buckets (load factor 1).
        if (_size == _table.size()) {
            _CheckCapacity();
        }

        const size_t bucketNo = _HashFunc(data);

        // Reject the element if it is already in the chain.
        for (HashNode<V>* cur = _table[bucketNo]; cur; cur = cur->_next) {
            if (cur->_data == data) {
                return false;
            }
        }

        // Push the new node at the head of the chain.
        HashNode<V>* newNode = new HashNode<V>(data);
        newNode->_next = _table[bucketNo];
        _table[bucketNo] = newNode;
        _size++;
        return true;
    }

private:
    // Trial-division primality test.
    static bool _IsPrime(size_t n) {
        if (n < 2) {
            return false;
        }
        for (size_t d = 2; d <= n / d; ++d) {
            if (n % d == 0) {
                return false;
            }
        }
        return true;
    }

    // Returns the smallest prime that is >= n (and at least 2).
    static size_t _GetNextPrime(size_t n) {
        size_t candidate = n < 2 ? 2 : n;
        while (!_IsPrime(candidate)) {
            ++candidate;
        }
        return candidate;
    }

    // Moves every node into a table of roughly twice the size. The existing
    // nodes are relinked rather than reallocated.
    void _CheckCapacity() {
        const size_t newCapacity = _GetNextPrime(_table.size() * 2);
        std::vector<HashNode<V>*> newTable(newCapacity, nullptr);

        for (size_t i = 0; i < _table.size(); ++i) {
            HashNode<V>* cur = _table[i];
            while (cur) {
                HashNode<V>* next = cur->_next;
                // The bucket index must use the NEW bucket count.
                const size_t newBucketNo = cur->_data % newCapacity;
                cur->_next = newTable[newBucketNo];
                newTable[newBucketNo] = cur;
                cur = next;
            }
            _table[i] = nullptr;
        }
        _table.swap(newTable);
    }

    // Division-remainder hash: maps an element to its bucket index.
    size_t _HashFunc(const V& data) const {
        return data % _table.size();
    }

    std::vector<HashNode<V>*> _table;
    size_t _size;  // number of stored elements
};

3.2.2 扩容机制

void _CheckCapacity() {size_t newCapacity = _GetNextPrime(_table.size() * 2);std::vector<HashNode<V>*> newTable(newCapacity, nullptr);// 重新哈希所有元素for (size_t i = 0; i < _table.size(); ++i) {HashNode<V>* cur = _table[i];while (cur) {HashNode<V>* next = cur->_next;size_t newBucketNo = _HashFunc(cur->_data, newCapacity);// 头插到新表中cur->_next = newTable[newBucketNo];newTable[newBucketNo] = cur;cur = next;}_table[i] = nullptr;}_table.swap(newTable);
}

4. 哈希的应用

4.1 位图（Bitmap）

4.1.1 位图概念

用每一位来存放某种状态，适用于海量数据且数据不重复的场景。

// Bitmap: stores one bit of state per position in [0, bitCount).
// The bits are packed into 32-bit words.
class BitSet {
public:
    // Creates a bitmap with `bitCount` addressable bits, all cleared.
    BitSet(size_t bitCount) : _bitCount(bitCount) {
        // Number of 32-bit words needed; the +1 covers a partial last word.
        _bits.resize((bitCount >> 5) + 1, 0);
    }

    // Sets bit `pos` to 1. Positions >= bitCount are ignored.
    void Set(size_t pos) {
        if (pos >= _bitCount) return;
        const size_t index = pos >> 5;     // pos / 32
        const size_t offset = pos & 0x1F;  // pos % 32
        // Unsigned literal: shifting a signed 1 into bit 31 would be undefined.
        _bits[index] |= (1u << offset);
    }

    // Sets bit `pos` to 0. Positions >= bitCount are ignored.
    void Reset(size_t pos) {
        if (pos >= _bitCount) return;
        const size_t index = pos >> 5;
        const size_t offset = pos & 0x1F;
        _bits[index] &= ~(1u << offset);
    }

    // Returns true when bit `pos` is 1; false for positions >= bitCount.
    bool Test(size_t pos) const {
        if (pos >= _bitCount) return false;
        const size_t index = pos >> 5;
        const size_t offset = pos & 0x1F;
        return (_bits[index] & (1u << offset)) != 0;
    }

private:
    std::vector<unsigned int> _bits;  // packed bits, 32 per word
    size_t _bitCount;                 // number of addressable bits
};

4.1.2 位图应用

问题：给40亿个不重复的无符号整数，如何快速判断一个数是否在其中？

解决方案：

// The bitmap must be sized by the VALUE range, not by how many numbers there
// are: a uint32_t can take 2^32 different values, so 2^32 bits (512 MB) are
// needed. With only 4,000,000,000 bits, every value in [4000000000, 2^32)
// would be ignored by Set() and always reported as absent.
// NOTE: this needs a platform where size_t is wider than 32 bits.
BitSet bitSet(static_cast<size_t>(UINT32_MAX) + 1);

// Insert all numbers.
for (uint32_t num : numbers) {
    bitSet.Set(num);
}

// Query.
if (bitSet.Test(queryNum)) {
    std::cout << "存在" << std::endl;
} else {
    std::cout << "不存在" << std::endl;
}

4.2 布隆过滤器（Bloom Filter）

4.2.1 基本概念

布隆过滤器是一种概率型数据结构，用于判断元素"一定不存在"或"可能存在"。

// 多个哈希函数
struct HashFunc1 {size_t operator()(const std::string& str) {size_t hash = 0;for (char c : str) {hash = hash * 131 + c;}return hash;}
};struct HashFunc2 {size_t operator()(const std::string& str) {size_t hash = 5381;for (char c : str) {hash = (hash << 5) + hash + c;}return hash;}
};struct HashFunc3 {size_t operator()(const std::string& str) {size_t hash = 0;for (size_t i = 0; i < str.size(); ++i) {if ((i & 1) == 0) {hash ^= ((hash << 7) ^ str[i] ^ (hash >> 3));} else {hash ^= (~((hash << 11) ^ str[i] ^ (hash >> 5)));}}return hash;}
};template<size_t N, class K = std::string,class Hash1 = HashFunc1,class Hash2 = HashFunc2, class Hash3 = HashFunc3>
class BloomFilter {
public:void Set(const K& key) {size_t hash1 = Hash1()(key) % (N * 5);size_t hash2 = Hash2()(key) % (N * 5);size_t hash3 = Hash3()(key) % (N * 5);_bitSet.Set(hash1);_bitSet.Set(hash2);_bitSet.Set(hash3);}bool Test(const K& key) {size_t hash1 = Hash1()(key) % (N * 5);if (!_bitSet.Test(hash1)) return false;size_t hash2 = Hash2()(key) % (N * 5);if (!_bitSet.Test(hash2)) return false;size_t hash3 = Hash3()(key) % (N * 5);if (!_bitSet.Test(hash3)) return false;return true;  // 可能存在}private:BitSet _bitSet{N * 5};  // 适当扩大位图大小减少误判率
};

4.2.2 布隆过滤器特性

优点：

空间效率极高
查询时间O(k)，k为哈希函数个数
不需要存储元素本身

缺点：

存在误判率
标准布隆过滤器不支持删除操作（多个元素可能共用同一个比特位，清零会影响其他元素）
无法获取元素本身

5. 海量数据处理面试题

5.1 哈希切割

问题：100G的log文件存储IP地址，找到出现次数最多的IP。

解决方案：

// Idea: split the big file into small files and count each one separately.
// 1. Use a hash function to distribute the IPs over the small files.
// 2. Count the IP frequencies inside each small file.
// 3. Combine the per-file results to find the maximum.
//
// The same IP always hashes to the same small file, so its complete count is
// known as soon as that file has been processed. A running maximum is enough;
// no global map over all IPs (which would not fit in memory) is needed.
std::string mostFrequentIp;
int maxCount = 0;

for (const auto& ip : allIps) {
    size_t fileIndex = std::hash<std::string>{}(ip) % fileCount;
    // Write ip into the small file number fileIndex.
}

// Process each small file separately.
for (int i = 0; i < fileCount; ++i) {
    std::unordered_map<std::string, int> localCount;

    // Read the small file and count.
    for (const auto& ip : smallFileIps[i]) {
        localCount[ip]++;
    }

    // Merge into the global result: keep the most frequent IP seen so far.
    for (const auto& entry : localCount) {
        if (entry.second > maxCount) {
            maxCount = entry.second;
            mostFrequentIp = entry.first;
        }
    }
}

5.2 位图应用

5.2.1 找出只出现一次的整数

// 使用两个位图表示状态：
// 00:出现0次, 01:出现1次, 10:出现多次
class FindOnce {
public:void Add(int num) {if (!_bit1.Test(num) && !_bit2.Test(num)) {// 00 -> 01_bit1.Set(num);} else if (_bit1.Test(num) && !_bit2.Test(num)) {// 01 -> 10_bit1.Reset(num);_bit2.Set(num);}// 10 -> 10 保持不变}bool IsOnce(int num) {return _bit1.Test(num) && !_bit2.Test(num);}private:BitSet _bit1{INT_MAX};  // 第一位BitSet _bit2{INT_MAX};  // 第二位
};

5.2.2 找两个文件的交集

// 1. 使用位图存储第一个文件的所有整数
// 2. 遍历第二个文件，检查是否在位图中
// 3. 为了去重，找到后重置位图对应位BitSet bitSet(UINT32_MAX);
// 读取第一个文件设置位图
// 读取第二个文件检查交集

5.3 布隆过滤器应用

5.3.1 近似算法求交集

// Approximate the intersection of two large files with a Bloom filter.
BloomFilter<1000000> bloomFilter;

// Add every query of the first file to the Bloom filter.
for (const auto& query : file1Queries) {
    bloomFilter.Set(query);
}

// Check the queries of the second file.
std::vector<std::string> intersection;
for (const auto& query : file2Queries) {
    if (bloomFilter.Test(query)) {
        // May be a false positive: Test() only reports "possibly present".
        intersection.push_back(query);
    }
}

5.3.2 支持删除的布隆过滤器

// 使用计数器代替位图
class CountingBloomFilter {
public:void Set(const std::string& key) {size_t h1 = Hash1()(key) % _size;size_t h2 = Hash2()(key) % _size;size_t h3 = Hash3()(key) % _size;_counters[h1]++;_counters[h2]++;_counters[h3]++;}void Reset(const std::string& key) {size_t h1 = Hash1()(key) % _size;size_t h2 = Hash2()(key) % _size;size_t h3 = Hash3()(key) % _size;if (_counters[h1] > 0) _counters[h1]--;if (_counters[h2] > 0) _counters[h2]--;if (_counters[h3] > 0) _counters[h3]--;}private:std::vector<int> _counters;
};

6. 总结

6.1 技术选型建议

场景	推荐技术	理由
需要快速查找	unordered_map/set	平均O(1)时间复杂度
元素需要有序	map/set	基于红黑树有序存储
海量数据判重	位图	空间效率极高
字符串等复杂类型判重	布隆过滤器	支持任意类型，空间效率高
精确统计	哈希表	无误差
近似判重（允许少量误判）	布隆过滤器	允许一定误差，空间效率高