当前位置: 首页 > news >正文

【cuda学习日记】5.2 共享内存数据分布

原文需要用nvprof去检查共享内存的事务,受限于nvprof不能使用。

5.2.1 方形共享内存
声明共享内存:
shared int tile[N][N];
因为是方形的内存块,用一个二维线程块访问,2种方法去访问:
tile[threadIdx.y][threadIdx.x]
tile[threadIdx.x][threadIdx.y]

核函数有两个简单操作:
·将全局线程索引按行主序写入到一个二维共享内存数组中
·从共享内存中按行主序读取这些值并将它们存储到全局内存中

#include <cuda_runtime.h>
#include <stdio.h>
#include "../common/common.h"
#include <iostream>


#define BDIMX 32
#define BDIMY 32


__global__ void warmup(int *out){
    __shared__ int tile[BDIMY][BDIMX];
    unsigned int idx = blockIdx.y * blockDim.x + threadIdx.x;
    // smem store
    tile[threadIdx.y][threadIdx.x] = idx;

    __syncthreads();
    // smem load
    out[idx] = tile[threadIdx.y][threadIdx.x];
}

__global__ void setRowReadRow(int *out){
    __shared__ int tile[BDIMY][BDIMX];
    unsigned int idx = blockIdx.y * blockDim.x + threadIdx.x;
    // smem store
    tile[threadIdx.y][threadIdx.x] = idx;

    __syncthreads();
    // smem load
    out[idx] = tile[threadIdx.y][threadIdx.x];
}

__global__ void setColReadCol(int *out){
    __shared__ int tile[BDIMY][BDIMX];
    unsigned int idx = blockIdx.y * blockDim.x + threadIdx.x;
    // smem store
    tile[threadIdx.x][threadIdx.y] = idx;

    __syncthreads();
    // smem load
    out[idx] = tile[threadIdx.x][threadIdx.y];
}


int main(int argc, char** argv){
    int dev = 0;
    cudaSetDevice(dev);
    cudaDeviceProp deviceprop;
    CHECK(cudaGetDeviceProperties(&deviceprop,dev));
    printf("device %d: %s \n", dev, deviceprop.name);
    std::cout << "Compute Capability: " << deviceprop.major << "." << deviceprop.minor << std::endl;


    dim3 block(BDIMX, BDIMY);
    dim3 grid (1,1); //only 1 block

    int nElem = BDIMX * BDIMX;
    int nBytes = nElem * sizeof(int);

    int *d_A;

    cudaMalloc((int**) &d_A, nBytes);

    Timer timer;
    timer.start();
    warmup<<<grid,block>>>(d_A);
    cudaDeviceSynchronize();
    timer.stop();
    float elapsedTime = timer.elapsedms();
    printf("warmup <<<grid (%4d, %4d), block (%4d, %4d)>>> elapsed %f ms \n", grid.x,grid.y, block.x, block.y, elapsedTime);

    
    timer.start();
    setRowReadRow<<<grid,block>>>(d_A);
    cudaDeviceSynchronize();
    timer.stop();
    elapsedTime = timer.elapsedms();
    printf("setRowReadRow <<<grid (%4d, %4d), block (%4d, %4d)>>> elapsed %f ms \n", grid.x,grid.y, block.x, block.y, elapsedTime);

    timer.start();
    setColReadCol<<<grid,block>>>(d_A);
    cudaDeviceSynchronize();
    timer.stop();
    elapsedTime = timer.elapsedms();
    printf("setColReadCol <<<grid (%4d, %4d), block (%4d, %4d)>>> elapsed %f ms \n", grid.x,grid.y, block.x, block.y, elapsedTime);

    cudaFree(d_A);

    cudaDeviceReset();
    return 0;

}

nvcc checkSmemSquare.cu -Xptxas -v -o checkSmemSquare.exe

checkSmemSquare.cu
ptxas info    : 0 bytes gmem
ptxas info    : Compiling entry function '_Z13setColReadColPi' for 'sm_52'
ptxas info    : Function properties for _Z13setColReadColPi
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 6 registers, used 1 barriers, 4096 bytes smem, 328 bytes cmem[0]
ptxas info    : Compiling entry function '_Z13setRowReadRowPi' for 'sm_52'
ptxas info    : Function properties for _Z13setRowReadRowPi
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 6 registers, used 1 barriers, 4096 bytes smem, 328 bytes cmem[0]
ptxas info    : Compiling entry function '_Z6warmupPi' for 'sm_52'
ptxas info    : Function properties for _Z6warmupPi
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 6 registers, used 1 barriers, 4096 bytes smem, 328 bytes cmem[0]
tmpxft_00004be0_00000000-10_checkSmemSquare.cudafe1.cpp
   Creating library checkSmemSquare.lib and object checkSmemSquare.exp

尝试通过NCU查看shared memory transactions
ncu --metrics smsp__sass_inst_executed_op_shared,smsp__sass_inst_executed_op_shared_ld,smsp__sass_inst_executed_op_shared_st checkSmemSquare.exe
没看出来啥区别

setRowReadRow(int *) (1, 1, 1)x(32, 32, 1), Context 1, Stream 7, Device 0, CC 8.9
    Section: Command line profiler metrics
    ----------------------------------------- ----------- ------------
    Metric Name                               Metric Unit Metric Value
    ----------------------------------------- ----------- ------------
    smsp__sass_inst_executed_op_shared.avg           inst         0.12
    smsp__sass_inst_executed_op_shared.max           inst           16
    smsp__sass_inst_executed_op_shared.min           inst            0
    smsp__sass_inst_executed_op_shared.sum           inst           64
    smsp__sass_inst_executed_op_shared_ld.avg        inst         0.06
    smsp__sass_inst_executed_op_shared_ld.max        inst            8
    smsp__sass_inst_executed_op_shared_ld.min        inst            0
    smsp__sass_inst_executed_op_shared_ld.sum        inst           32
    smsp__sass_inst_executed_op_shared_st.avg        inst         0.06
    smsp__sass_inst_executed_op_shared_st.max        inst            8
    smsp__sass_inst_executed_op_shared_st.min        inst            0
    smsp__sass_inst_executed_op_shared_st.sum        inst           32
    ----------------------------------------- ----------- ------------

  setColReadCol(int *) (1, 1, 1)x(32, 32, 1), Context 1, Stream 7, Device 0, CC 8.9
    Section: Command line profiler metrics
    ----------------------------------------- ----------- ------------
    Metric Name                               Metric Unit Metric Value
    ----------------------------------------- ----------- ------------
    smsp__sass_inst_executed_op_shared.avg           inst         0.12
    smsp__sass_inst_executed_op_shared.max           inst           16
    smsp__sass_inst_executed_op_shared.min           inst            0
    smsp__sass_inst_executed_op_shared.sum           inst           64
    smsp__sass_inst_executed_op_shared_ld.avg        inst         0.06
    smsp__sass_inst_executed_op_shared_ld.max        inst            8
    smsp__sass_inst_executed_op_shared_ld.min        inst            0
    smsp__sass_inst_executed_op_shared_ld.sum        inst           32
    smsp__sass_inst_executed_op_shared_st.avg        inst         0.06
    smsp__sass_inst_executed_op_shared_st.max        inst            8
    smsp__sass_inst_executed_op_shared_st.min        inst            0
    smsp__sass_inst_executed_op_shared_st.sum        inst           32
    ----------------------------------------- ----------- ------------

5.2.2 按行主序写和按列主序读

__global__ void setRowReadCol(int *out){
    __shared__ int tile[BDIMY][BDIMX];
    unsigned int idx = blockIdx.y * blockDim.x + blockIdx.x;
    // smem store
    tile[threadIdx.y][threadIdx.x] = idx;

    __syncthreads();
    // smem load
    out[idx] = tile[threadIdx.x][threadIdx.y];
}

5.2.3 动态共享内存
核函数不知道需要声明多少大小的SMEM时,可以用动态声明

__global__ void setRowReadColDyn(int *out){
    //dynamic shared mem
    extern __shared__ int tile[];
    unsigned int row_idx = blockIdx.y * blockDim.x + threadIdx.x;
    unsigned int col_idx = blockIdx.x * blockDim.y + threadIdx.y;

    // smem store
    tile[row_idx] = row_idx;

    __syncthreads();
    // smem load
    out[row_idx] = tile[col_idx];
}

在调用核函数的时候需要在<<>>>的第三个参数中传递memory大小

setRowReadColDyn<<<grid,block, BDIMX * BDIMY * sizeof(int)>>>(d_A);

5.2.4 填充静态声明的共享内存
填充数组是避免存储体冲突的一种方法。填充静态声明的共享内存很简单。

#define IPAD 1
__global__ void setRowReadColPad(int *out){
    __shared__ int tile[BDIMY][BDIMX + IPAD];
    unsigned int idx = blockIdx.y * blockDim.x + threadIdx.x;
    // smem store
    tile[threadIdx.y][threadIdx.x] = idx;

    __syncthreads();
    // smem load
    out[idx] = tile[threadIdx.x][threadIdx.y];
}

nsys profile --stats=true .\checkSmemSquare.exe

 Time (%)  Total Time (ns)  Instances  Avg (ns)  Med (ns)  Min (ns)  Max (ns)  StdDev (ns)           Name
 --------  ---------------  ---------  --------  --------  --------  --------  -----------  -----------------------
     23.4             1728          1    1728.0    1728.0      1728      1728          0.0  setColReadCol(int *)
     18.6             1376          1    1376.0    1376.0      1376      1376          0.0  setRowReadCol(int *)
     17.7             1312          1    1312.0    1312.0      1312      1312          0.0  warmup(int *)
     13.4              992          1     992.0     992.0       992       992          0.0  setRowReadColDyn(int *)
     13.4              992          1     992.0     992.0       992       992          0.0  setRowReadColPad(int *)
     13.4              992          1     992.0     992.0       992       992          0.0  setRowReadRow(int *)  

相关文章:

  • 哔哩哔哩IT私塾python爬虫视频教程中的项目文件
  • python绘制cox列线图及绘制指南
  • Halcon 学习之路 set_grayval 算子
  • c++stl——容器
  • C++Qt学习笔记——实现一个串口通信界面
  • Debian安装C语言环境
  • DeepSeek开源:FlashMLA深度解析:Hopper架构上的大模型推理革命
  • 重大更新!锂电池剩余寿命预测新增 CALCE 数据集
  • 硬件基础(3):三极管(3):三极管作为开关的时候为什么设置其工作在截止区和饱和区
  • 达梦数据库中jdbc接口的大批量插入数据的写法推荐
  • 评估自动驾驶(AD)策略性能的关键指标
  • 数字化转型数据自动采集统计分析发那科(FANUC)数据采集
  • Cuppa CMS v1.0 任意文件读取(CVE-2022-25401)
  • 过滤器 二、过滤器详解
  • VScode在windows10上使用clang-format
  • or-tools编译命令自用备注
  • Linux命令入门
  • 星座-从入门到精通
  • 18.6 大语言模型可解释性解密:打开AI黑箱的关键技术
  • 【补阙拾遗】排序之冒泡、插入、选择排序
  • 贵州仁怀通报“正新鸡排鸡腿里全是蛆”:已对同类产品封存送检
  • 俄代表团:16日上午将继续“等候乌代表团”
  • 特朗普再提“接管”加沙,要将其变为“自由区”
  • 汤加附近海域发生6.4级地震
  • 马上评丨火车穿村而过多人被撞身亡,亡羊补牢慢不得
  • 青海规范旅游包车行为:不得引导外省籍旅游包车违规驻地运营