使用 CUDA 调试工具 compute-sanitizer 检测数据竞争(竞争条件,race condition)
有数据竞争的代码 race.cu:
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>// 有明显数据竞争的内核
/**
 * Demo kernel containing an INTENTIONAL global-memory data race.
 *
 * Every thread whose global index is below N performs an unsynchronized
 * read-modify-write on data[0], so the final value of data[0] is
 * nondeterministic. The race is kept on purpose: this file exists to
 * exercise compute-sanitizer. (A race-free version would use
 * atomicAdd(&data[0], data[tid]).)
 *
 * NOTE: compute-sanitizer's racecheck tool is documented as a shared-memory
 * hazard detector, so this global-memory race is not expected to be reported
 * by `--tool racecheck`.
 *
 * @param data  device array with at least N ints; data[0] is the contended
 *              accumulator
 * @param N     number of valid elements in data
 *
 * Launch layout: 1D grid of 1D blocks; threads with index >= N do nothing.
 */
__global__ void raceConditionKernel(int *data, int N) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    // Data race (intentional): many threads update data[0] concurrently
    // without atomics or any other synchronization.
    if (tid < N) {
        data[0] += data[tid];  // all in-range threads compete to write data[0]
    }
}

// Shared-memory data race
/**
 * Demo kernel containing an INTENTIONAL shared-memory data race.
 *
 * Threads 0..9 of the block all store their own threadIdx.x into the same
 * __shared__ variable with no ordering between them (a write-after-write
 * hazard), so the value that thread 0 publishes to *output is whichever
 * write happened to land last. The race is kept on purpose so that
 * `compute-sanitizer --tool racecheck` has a shared-memory hazard to report.
 *
 * @param output  device pointer to a single int that receives the surviving
 *                value of the shared variable
 *
 * Launch layout: 1D block. If the block has no thread with threadIdx.x < 10
 * other than those present, only the present ones write.
 */
__global__ void sharedMemoryRaceKernel(int *output) {
    __shared__ int shared_var;

    // Data race (intentional): several threads initialize the same shared
    // variable at the same time with different values.
    if (threadIdx.x < 10) {
        shared_var = threadIdx.x;  // multiple threads compete to write
    }

    // Barrier is reached by every thread in the block (it is outside the
    // branch), so the read below sees one of the racing writes.
    __syncthreads();

    if (threadIdx.x == 0) {
        *output = shared_var;
    }
}

// A more complex data race
/**
 * Demo kernel containing an INTENTIONAL global-memory data race spread over
 * ten addresses.
 *
 * Each thread stages its global index in its own slot of a shared array,
 * synchronizes, and then adds that value to data[tid % 10] with an
 * unsynchronized read-modify-write. Because many threads map to the same
 * target index, the updates race. The shared-memory accesses themselves are
 * hazard-free (each thread touches only its own slot), so the race is purely
 * on global memory and is not expected to be reported by racecheck, which is
 * documented as a shared-memory hazard detector.
 *
 * @param data  device array with at least min(N, 10) ints; elements 0..9 are
 *              the contended accumulators
 * @param N     number of threads (by global index) allowed to update data
 *
 * Launch layout: 1D grid of 1D blocks with blockDim.x <= 256 (size of the
 * static shared array).
 */
__global__ void complexRaceKernel(int *data, int N) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    __shared__ int temp[256];
    temp[threadIdx.x] = tid;
    __syncthreads();

    // Data race (intentional): many threads write to data[tid % 10].
    if (tid < N) {
        int target_index = tid % 10;  // only 10 distinct indices
        data[target_index] += temp[threadIdx.x % 256];
    }
}

// Abort with a readable message if a CUDA runtime call fails. Guarded so it
// does not clash with a definition provided elsewhere.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)
#endif

/**
 * Host driver: runs the three intentionally racy kernels in sequence and
 * prints the first ten elements of the resulting array.
 *
 * Because the kernels race, the printed values are nondeterministic; the
 * program is meant to be run under compute-sanitizer.
 *
 * @return 0 on success; exits with EXIT_FAILURE on allocation or CUDA errors
 */
int main() {
    const int N = 1000;
    int *d_data = NULL;
    int *d_output = NULL;
    int *h_data = NULL;

    // Allocate memory
    CUDA_CHECK(cudaMalloc(&d_data, N * sizeof(int)));
    h_data = (int *)malloc(N * sizeof(int));
    if (h_data == NULL) {
        fprintf(stderr, "Host allocation of %d ints failed\n", N);
        cudaFree(d_data);
        return EXIT_FAILURE;
    }

    // Initialize data: h_data[i] = i + 1
    for (int i = 0; i < N; i++) {
        h_data[i] = i + 1;
    }
    CUDA_CHECK(cudaMemcpy(d_data, h_data, N * sizeof(int),
                          cudaMemcpyHostToDevice));

    printf("Running kernels with race conditions...\n");

    // Launch configuration for the racy kernels: 10 blocks x 100 threads
    // gives exactly N = 1000 threads.
    dim3 blocks(10);
    dim3 threads(100);

    // Kernel 1: global-memory race
    printf("1. Global memory race condition:\n");
    raceConditionKernel<<<blocks, threads>>>(d_data, N);
    CUDA_CHECK(cudaGetLastError());        // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());   // asynchronous execution errors

    // Kernel 2: shared-memory race
    printf("2. Shared memory race condition:\n");
    CUDA_CHECK(cudaMalloc(&d_output, sizeof(int)));
    sharedMemoryRaceKernel<<<1, 32>>>(d_output);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    // Kernel 3: complex race
    printf("3. Complex race condition:\n");
    complexRaceKernel<<<blocks, threads>>>(d_data, N);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    // Read back results
    CUDA_CHECK(cudaMemcpy(h_data, d_data, N * sizeof(int),
                          cudaMemcpyDeviceToHost));
    printf("First 10 results: ");
    for (int i = 0; i < 10; i++) {
        printf("%d ", h_data[i]);
    }
    printf("\n");

    // Cleanup
    CUDA_CHECK(cudaFree(d_data));
    CUDA_CHECK(cudaFree(d_output));
    free(h_data);

    printf("Program completed.\n");
    return 0;
}
命令:
nvcc -g -G race.cu
compute-sanitizer --tool racecheck ./a.out
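试了几次,只能检测到第二种数据竞争,不知道为什么。(可能的原因:根据 NVIDIA 文档,racecheck 工具只检测共享内存(`__shared__`)访问上的冒险(hazard),不检查全局内存;第一个和第三个内核的竞争都发生在全局内存 `data` 上,因此不会被报告,只有第二个内核对共享变量 `shared_var` 的竞争写入会被检测到。)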