【cuda学习日记】5.2 共享内存数据分布
原文需要用nvprof去检查共享内存的事务,受限于nvprof不能使用。
5.2.1 方形共享内存
声明共享内存:
shared int tile[N][N];
因为是方形的内存块,用一个二维线程块访问,2种方法去访问:
tile[threadIdx.y][threadIdx.x]
tile[threadIdx.x][threadIdx.y]
核函数有两个简单操作:
·将全局线程索引按行主序写入到一个二维共享内存数组中
·从共享内存中按行主序读取这些值并将它们存储到全局内存中
#include <cuda_runtime.h>
#include <stdio.h>
#include "../common/common.h"
#include <iostream>
#define BDIMX 32
#define BDIMY 32
__global__ void warmup(int *out){
__shared__ int tile[BDIMY][BDIMX];
unsigned int idx = blockIdx.y * blockDim.x + threadIdx.x;
// smem store
tile[threadIdx.y][threadIdx.x] = idx;
__syncthreads();
// smem load
out[idx] = tile[threadIdx.y][threadIdx.x];
}
__global__ void setRowReadRow(int *out){
__shared__ int tile[BDIMY][BDIMX];
unsigned int idx = blockIdx.y * blockDim.x + threadIdx.x;
// smem store
tile[threadIdx.y][threadIdx.x] = idx;
__syncthreads();
// smem load
out[idx] = tile[threadIdx.y][threadIdx.x];
}
__global__ void setColReadCol(int *out){
__shared__ int tile[BDIMY][BDIMX];
unsigned int idx = blockIdx.y * blockDim.x + threadIdx.x;
// smem store
tile[threadIdx.x][threadIdx.y] = idx;
__syncthreads();
// smem load
out[idx] = tile[threadIdx.x][threadIdx.y];
}
int main(int argc, char** argv){
int dev = 0;
cudaSetDevice(dev);
cudaDeviceProp deviceprop;
CHECK(cudaGetDeviceProperties(&deviceprop,dev));
printf("device %d: %s \n", dev, deviceprop.name);
std::cout << "Compute Capability: " << deviceprop.major << "." << deviceprop.minor << std::endl;
dim3 block(BDIMX, BDIMY);
dim3 grid (1,1); //only 1 block
int nElem = BDIMX * BDIMX;
int nBytes = nElem * sizeof(int);
int *d_A;
cudaMalloc((int**) &d_A, nBytes);
Timer timer;
timer.start();
warmup<<<grid,block>>>(d_A);
cudaDeviceSynchronize();
timer.stop();
float elapsedTime = timer.elapsedms();
printf("warmup <<<grid (%4d, %4d), block (%4d, %4d)>>> elapsed %f ms \n", grid.x,grid.y, block.x, block.y, elapsedTime);
timer.start();
setRowReadRow<<<grid,block>>>(d_A);
cudaDeviceSynchronize();
timer.stop();
elapsedTime = timer.elapsedms();
printf("setRowReadRow <<<grid (%4d, %4d), block (%4d, %4d)>>> elapsed %f ms \n", grid.x,grid.y, block.x, block.y, elapsedTime);
timer.start();
setColReadCol<<<grid,block>>>(d_A);
cudaDeviceSynchronize();
timer.stop();
elapsedTime = timer.elapsedms();
printf("setColReadCol <<<grid (%4d, %4d), block (%4d, %4d)>>> elapsed %f ms \n", grid.x,grid.y, block.x, block.y, elapsedTime);
cudaFree(d_A);
cudaDeviceReset();
return 0;
}
nvcc checkSmemSquare.cu -Xptxas -v -o checkSmemSquare.exe
checkSmemSquare.cu
ptxas info : 0 bytes gmem
ptxas info : Compiling entry function '_Z13setColReadColPi' for 'sm_52'
ptxas info : Function properties for _Z13setColReadColPi
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 6 registers, used 1 barriers, 4096 bytes smem, 328 bytes cmem[0]
ptxas info : Compiling entry function '_Z13setRowReadRowPi' for 'sm_52'
ptxas info : Function properties for _Z13setRowReadRowPi
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 6 registers, used 1 barriers, 4096 bytes smem, 328 bytes cmem[0]
ptxas info : Compiling entry function '_Z6warmupPi' for 'sm_52'
ptxas info : Function properties for _Z6warmupPi
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 6 registers, used 1 barriers, 4096 bytes smem, 328 bytes cmem[0]
tmpxft_00004be0_00000000-10_checkSmemSquare.cudafe1.cpp
Creating library checkSmemSquare.lib and object checkSmemSquare.exp
尝试通过NCU查看shared memory transactions
ncu --metrics smsp__sass_inst_executed_op_shared,smsp__sass_inst_executed_op_shared_ld,smsp__sass_inst_executed_op_shared_st checkSmemSquare.exe
没看出来啥区别
setRowReadRow(int *) (1, 1, 1)x(32, 32, 1), Context 1, Stream 7, Device 0, CC 8.9
Section: Command line profiler metrics
----------------------------------------- ----------- ------------
Metric Name Metric Unit Metric Value
----------------------------------------- ----------- ------------
smsp__sass_inst_executed_op_shared.avg inst 0.12
smsp__sass_inst_executed_op_shared.max inst 16
smsp__sass_inst_executed_op_shared.min inst 0
smsp__sass_inst_executed_op_shared.sum inst 64
smsp__sass_inst_executed_op_shared_ld.avg inst 0.06
smsp__sass_inst_executed_op_shared_ld.max inst 8
smsp__sass_inst_executed_op_shared_ld.min inst 0
smsp__sass_inst_executed_op_shared_ld.sum inst 32
smsp__sass_inst_executed_op_shared_st.avg inst 0.06
smsp__sass_inst_executed_op_shared_st.max inst 8
smsp__sass_inst_executed_op_shared_st.min inst 0
smsp__sass_inst_executed_op_shared_st.sum inst 32
----------------------------------------- ----------- ------------
setColReadCol(int *) (1, 1, 1)x(32, 32, 1), Context 1, Stream 7, Device 0, CC 8.9
Section: Command line profiler metrics
----------------------------------------- ----------- ------------
Metric Name Metric Unit Metric Value
----------------------------------------- ----------- ------------
smsp__sass_inst_executed_op_shared.avg inst 0.12
smsp__sass_inst_executed_op_shared.max inst 16
smsp__sass_inst_executed_op_shared.min inst 0
smsp__sass_inst_executed_op_shared.sum inst 64
smsp__sass_inst_executed_op_shared_ld.avg inst 0.06
smsp__sass_inst_executed_op_shared_ld.max inst 8
smsp__sass_inst_executed_op_shared_ld.min inst 0
smsp__sass_inst_executed_op_shared_ld.sum inst 32
smsp__sass_inst_executed_op_shared_st.avg inst 0.06
smsp__sass_inst_executed_op_shared_st.max inst 8
smsp__sass_inst_executed_op_shared_st.min inst 0
smsp__sass_inst_executed_op_shared_st.sum inst 32
----------------------------------------- ----------- ------------
5.2.2 按行主序写和按列主序读
__global__ void setRowReadCol(int *out){
__shared__ int tile[BDIMY][BDIMX];
unsigned int idx = blockIdx.y * blockDim.x + blockIdx.x;
// smem store
tile[threadIdx.y][threadIdx.x] = idx;
__syncthreads();
// smem load
out[idx] = tile[threadIdx.x][threadIdx.y];
}
5.2.3 动态共享内存
核函数不知道需要声明多少大小的SMEM时,可以用动态声明
__global__ void setRowReadColDyn(int *out){
//dynamic shared mem
extern __shared__ int tile[];
unsigned int row_idx = blockIdx.y * blockDim.x + threadIdx.x;
unsigned int col_idx = blockIdx.x * blockDim.y + threadIdx.y;
// smem store
tile[row_idx] = row_idx;
__syncthreads();
// smem load
out[row_idx] = tile[col_idx];
}
在调用核函数的时候需要在<<>>>的第三个参数中传递memory大小
setRowReadColDyn<<<grid,block, BDIMX * BDIMY * sizeof(int)>>>(d_A);
5.2.4 填充静态声明的共享内存
填充数组是避免存储体冲突的一种方法。填充静态声明的共享内存很简单。
#define IPAD 1
__global__ void setRowReadColPad(int *out){
__shared__ int tile[BDIMY][BDIMX + IPAD];
unsigned int idx = blockIdx.y * blockDim.x + threadIdx.x;
// smem store
tile[threadIdx.y][threadIdx.x] = idx;
__syncthreads();
// smem load
out[idx] = tile[threadIdx.x][threadIdx.y];
}
nsys profile --stats=true .\checkSmemSquare.exe
Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name
-------- --------------- --------- -------- -------- -------- -------- ----------- -----------------------
23.4 1728 1 1728.0 1728.0 1728 1728 0.0 setColReadCol(int *)
18.6 1376 1 1376.0 1376.0 1376 1376 0.0 setRowReadCol(int *)
17.7 1312 1 1312.0 1312.0 1312 1312 0.0 warmup(int *)
13.4 992 1 992.0 992.0 992 992 0.0 setRowReadColDyn(int *)
13.4 992 1 992.0 992.0 992 992 0.0 setRowReadColPad(int *)
13.4 992 1 992.0 992.0 992 992 0.0 setRowReadRow(int *)