[Performance Analysis & Optimization] False Sharing (perf + C++)
False sharing arises from two facts:
- Every cache read actually pulls in an entire cache line, not just the requested data
- Multiple threads may pull in the same cache line
Because write-backs to memory also happen at cache-line granularity, a line touched by several writers cannot truly be shared: each core must wait for the others before it can write the line back.
There are two common fixes:
- Isolate the data each thread works on into separate cache lines, e.g. with alignas or explicit padding
- Isolate the data per thread with thread_local
Perf
As the analysis above shows, false sharing manifests mainly as cache misses, so measure the corresponding event:
perf stat -e cache-misses ./benchmark
Example Code
Original
#include <iostream>
#include <thread>

struct SharedData {
    int x;
    int y;
};

const int epoch = 1000000000;

void incrementX(SharedData &data) {
    for (int i = 0; i < epoch; ++i) {
        data.x++;
    }
}

void incrementY(SharedData &data) {
    for (int i = 0; i < epoch; ++i) {
        data.y++;
    }
}

int main() {
    SharedData data{0, 0};
    std::thread t1(incrementX, std::ref(data));
    std::thread t2(incrementY, std::ref(data));
    t1.join();
    t2.join();
    std::cout << "x: " << data.x << ", y: " << data.y << std::endl;
    return 0;
}
threadlocal
#include <iostream>
#include <thread>

struct ThreadData {
    int x;
    int y;
};

thread_local ThreadData threadData = {0, 0};

const int epoch = 1000000000;

void incrementX() {
    for (int i = 0; i < epoch; ++i) {
        threadData.x++;
    }
}

void incrementY() {
    for (int i = 0; i < epoch; ++i) {
        threadData.y++;
    }
}

int main() {
    int totalX = 0, totalY = 0;
    std::thread t1([&totalX]() {
        incrementX();
        totalX = threadData.x;  // copy thread 1's result into a main-thread variable
    });
    std::thread t2([&totalY]() {
        incrementY();
        totalY = threadData.y;  // copy thread 2's result into a main-thread variable
    });
    t1.join();
    t2.join();
    std::cout << "x: " << totalX << ", y: " << totalY << std::endl;
    return 0;
}
padding
#include <iostream>
#include <thread>

struct SharedData {
    int x;
    char padding[60];  // pads x out to a full 64-byte cache line, pushing y onto the next one
    int y;
};

const int epoch = 1000000000;

void incrementX(SharedData &data) {
    for (int i = 0; i < epoch; ++i) {
        data.x++;
    }
}

void incrementY(SharedData &data) {
    for (int i = 0; i < epoch; ++i) {
        data.y++;
    }
}

int main() {
    SharedData data{0, 0};
    std::thread t1(incrementX, std::ref(data));
    std::thread t2(incrementY, std::ref(data));
    t1.join();
    t2.join();
    std::cout << "x: " << data.x << ", y: " << data.y << std::endl;
    return 0;
}
Results
| -O0 -g       | benchmark  | threadlocal | padding    |
|--------------|------------|-------------|------------|
| cache-misses | 111727010  | 422532      | 208243     |
| time (s)     | 2.53153364 | 1.517304314 | 0.63421379 |
Note that the threadlocal variant incurs almost twice as many cache misses as the padding variant; querying the CPU's prefetch counters with perf confirms this is caused by the hardware prefetching mechanism.
With -O3, the three versions show almost no performance difference, presumably because the compiler keeps each counter in a register for the whole loop and stores it to memory only once, so the shared cache line is barely touched.