
[CUDA Learning Diary] 5.3 Reducing Global Memory Access

This section revisits the parallel reduction problem discussed in Chapter 3, this time focusing on reducing global memory accesses.

5.3.1 Parallel Reduction with Shared Memory
reduceGmem – baseline version that performs the reduction directly in global memory
reduceSmem – version that performs the in-block reduction in shared memory

#include <cuda_runtime.h>
#include <stdio.h>
#include "../common/common.h"
#include <iostream>

#define DIM 128

// CPU recursive reduction, used to verify the GPU results
int recursiveReduce(int *data, int const size){
    if (size == 1) return data[0];
    int const stride = size /2;
    for (int i = 0; i < stride; i ++){
        data[i] += data[i + stride];
    }
    return recursiveReduce( data, stride);
}

// identical to reduceGmem; runs once to warm up the device before timing
__global__ void warmup( int *g_idata, int *g_odata, unsigned int n){
    unsigned int tid  = threadIdx.x;
    int *idata = g_idata + blockIdx.x * blockDim.x;

    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) return;
    
    if (blockDim.x >= 1024 &&  tid < 512) idata[tid] += idata[tid+ 512];
    __syncthreads();
    if (blockDim.x >= 512 &&  tid < 256) idata[tid] += idata[tid+ 256];
    __syncthreads();
    if (blockDim.x >= 256 &&  tid < 128) idata[tid] += idata[tid+ 128];
    __syncthreads();
    if (blockDim.x >= 128 &&  tid < 64) idata[tid] += idata[tid+ 64];
    __syncthreads();

    if (tid < 32){
        volatile int *vmem  = idata;
        vmem[tid] += vmem[tid + 32];
        vmem[tid] += vmem[tid + 16];
        vmem[tid] += vmem[tid +  8];
        vmem[tid] += vmem[tid +  4];
        vmem[tid] += vmem[tid +  2];
        vmem[tid] += vmem[tid +  1];
    }

    if  (tid == 0){ g_odata[blockIdx.x] = idata[0];}
}

__global__ void reduceGmem( int *g_idata, int *g_odata, unsigned int n){
    unsigned int tid  = threadIdx.x;
    int *idata = g_idata + blockIdx.x * blockDim.x;

    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) return;
    
    if (blockDim.x >= 1024 &&  tid < 512) idata[tid] += idata[tid+ 512];
    __syncthreads();
    if (blockDim.x >= 512 &&  tid < 256) idata[tid] += idata[tid+ 256];
    __syncthreads();
    if (blockDim.x >= 256 &&  tid < 128) idata[tid] += idata[tid+ 128];
    __syncthreads();
    if (blockDim.x >= 128 &&  tid < 64) idata[tid] += idata[tid+ 64];
    __syncthreads();

    // unrolled reduction within the final warp; volatile forces each access to go to memory
    if (tid < 32){
        volatile int *vmem  = idata;
        vmem[tid] += vmem[tid + 32];
        vmem[tid] += vmem[tid + 16];
        vmem[tid] += vmem[tid +  8];
        vmem[tid] += vmem[tid +  4];
        vmem[tid] += vmem[tid +  2];
        vmem[tid] += vmem[tid +  1];
    }

    if  (tid == 0){ g_odata[blockIdx.x] = idata[0];}
}

__global__ void reduceSmem(int *g_idata, int *g_odata, unsigned int n){
    __shared__ int smem[DIM];

    unsigned int tid  = threadIdx.x;
    // convert global data pointer to local pointer
    int *idata = g_idata + blockIdx.x * blockDim.x;

    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) return;

    // each thread loads one element from global memory into shared memory
    smem[tid] = idata[tid];
    __syncthreads();

    if (blockDim.x >= 1024 &&  tid < 512) smem[tid] += smem[tid+ 512];
    __syncthreads();
    if (blockDim.x >= 512 &&  tid < 256) smem[tid] += smem[tid+ 256];
    __syncthreads();
    if (blockDim.x >= 256 &&  tid < 128) smem[tid] += smem[tid+ 128];
    __syncthreads();
    if (blockDim.x >= 128 &&  tid < 64) smem[tid] += smem[tid+ 64];
    __syncthreads();

    // unrolled reduction within the final warp; volatile forces each access to go to memory
    if (tid < 32){
        volatile int *vsmem  = smem;
        vsmem[tid] += vsmem[tid + 32];
        vsmem[tid] += vsmem[tid + 16];
        vsmem[tid] += vsmem[tid +  8];
        vsmem[tid] += vsmem[tid +  4];
        vsmem[tid] += vsmem[tid +  2];
        vsmem[tid] += vsmem[tid +  1];
    }

    if  (tid == 0){ g_odata[blockIdx.x] = smem[0];}
}

int main(int argc , char **argv)
{
    printf("%s starting\n", argv[0]);

    int dev = 0;
    cudaDeviceProp deviceprop;
    CHECK(cudaGetDeviceProperties(&deviceprop,dev));
    printf("Using Device %d : %s\n", dev, deviceprop.name);
    
    int size = 1 << 24;
    // note: blocksize is read from the command line, but the kernels above
    // require blockDim.x == DIM because they use a statically sized
    // shared-memory array, so the launch configuration is fixed to DIM
    int blocksize = 512;

    if (argc > 1){
        blocksize = atoi(argv[1]);
    }

    dim3 block(DIM, 1);  // 1D block
    dim3 grid ((size + block.x - 1) / block.x, 1);
    
    size_t nBytes = size  * sizeof(int);
    int * h_idata = (int*) malloc(nBytes);
    int * h_odata = (int*) malloc( grid.x * sizeof(int));  // one partial sum per block
    int * temp = (int*) malloc(nBytes);

    // initialize the input array with random values
    for (int i = 0 ; i < size;i++){
        h_idata[i] = (int)(rand() & 0xff);
    }

    int sum = 0;
    for (int i = 0 ; i < size;i++){
        sum += h_idata[i];
    }
    printf("sum value is : %d\n", sum);

    memcpy(temp, h_idata, nBytes);

    int gpu_sum = 0;

    int *d_idata = NULL;
    int *d_odata = NULL;

    cudaMalloc((void**)&d_idata, nBytes);
    cudaMalloc((void**)&d_odata, grid.x * sizeof(int));

    
    //cpu sum
    Timer timer;
    timer.start();

    int cpu_sum = recursiveReduce(temp, size);
    timer.stop();
    float elapsedTime = timer.elapsedms();

    printf("cpu reduce time: %f,  sum: %d\n", elapsedTime, cpu_sum);

    //gpu sum
    cudaMemcpy(d_idata, h_idata, nBytes, cudaMemcpyHostToDevice);
    cudaDeviceSynchronize();
    timer.start();
    warmup<<<grid.x, block>>>(d_idata, d_odata, size);
    cudaDeviceSynchronize(); 
    timer.stop();
    float elapsedTime1 = timer.elapsedms();
    cudaMemcpy(h_odata, d_odata, grid.x * sizeof(int),cudaMemcpyDeviceToHost);
    gpu_sum = 0;
    for (int i = 0; i < grid.x; i ++){
        gpu_sum += h_odata[i];
    }
    printf("warm up reduce time: %f,  sum: %d\n", elapsedTime1, gpu_sum);


    //gpu sum
    cudaMemcpy(d_idata, h_idata, nBytes, cudaMemcpyHostToDevice);
    cudaDeviceSynchronize();
    timer.start();
    reduceGmem<<<grid.x, block>>>(d_idata, d_odata, size);
    cudaDeviceSynchronize(); 
    timer.stop();
    elapsedTime1 = timer.elapsedms();
    cudaMemcpy(h_odata, d_odata, grid.x * sizeof(int),cudaMemcpyDeviceToHost);
    gpu_sum = 0;
    for (int i = 0; i < grid.x ; i ++){
        gpu_sum += h_odata[i];
    }
    printf("reduceGmem gpu reduce time: %f,  sum: %d, gird ,block (%d %d)\n", elapsedTime1, gpu_sum, grid.x , block.x);

    //gpu sum
    cudaMemcpy(d_idata, h_idata, nBytes, cudaMemcpyHostToDevice);
    cudaDeviceSynchronize();
    timer.start();
    reduceSmem<<<grid.x, block>>>(d_idata, d_odata, size);
    cudaDeviceSynchronize(); 
    timer.stop();
    elapsedTime1 = timer.elapsedms();
    cudaMemcpy(h_odata, d_odata, grid.x * sizeof(int),cudaMemcpyDeviceToHost);
    gpu_sum = 0;
    for (int i = 0; i < grid.x ; i ++){
        gpu_sum += h_odata[i];
    }
    printf("reduceSmem gpu reduce time: %f,  sum: %d, gird ,block (%d %d)\n", elapsedTime1, gpu_sum, grid.x , block.x);


    cudaFree(d_idata);
    cudaFree(d_odata);
    cudaDeviceReset();

    free(h_idata);
    free(h_odata);
    free(temp);

    return 0;
}
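The CHECK macro and the Timer class come from the book's ../common/common.h helper header, which is not listed here. A minimal sketch of equivalent helpers (an assumption; the actual common.h may be implemented differently) could look like this:

#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// hypothetical stand-in for the CHECK macro: abort on any CUDA runtime error
#define CHECK(call)                                                          \
do {                                                                         \
    cudaError_t err = (call);                                                \
    if (err != cudaSuccess) {                                                \
        fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,        \
                cudaGetErrorString(err));                                    \
        exit(EXIT_FAILURE);                                                  \
    }                                                                        \
} while (0)

// hypothetical stand-in for Timer: wall-clock timing in milliseconds
struct Timer {
    std::chrono::high_resolution_clock::time_point t0, t1;
    void start() { t0 = std::chrono::high_resolution_clock::now(); }
    void stop()  { t1 = std::chrono::high_resolution_clock::now(); }
    float elapsedms() {
        return std::chrono::duration<float, std::milli>(t1 - t0).count();
    }
};

Because the host code calls cudaDeviceSynchronize() before timer.stop(), a wall-clock timer like this captures the full kernel execution time.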

Profile the program with nsys:
nsys profile --stats=true reduce.exe
Output:

Time (%)  Total Time (ns)  Instances  Avg (ns)  Med (ns)  Min (ns)  Max (ns)  StdDev (ns)                   Name
 --------  ---------------  ---------  --------  --------  --------  --------  -----------  --------------------------------------
     39.8           134721          1  134721.0  134721.0    134721    134721          0.0  warmup(int *, int *, unsigned int)
     38.0           128385          1  128385.0  128385.0    128385    128385          0.0  reduceGmem(int *, int *, unsigned int)
     22.2            75040          1   75040.0   75040.0     75040     75040          0.0  reduceSmem(int *, int *, unsigned int)
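
The nsys summary above reports only kernel durations. To verify that the shared-memory version actually issues fewer global memory load/store transactions, the counters could additionally be collected with Nsight Compute; the command below is an assumption (metric names vary between tool versions, these are the usual replacements for the older gld_transactions/gst_transactions counters):

ncu --metrics l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum,l1tex__t_sectors_pipe_lsu_mem_global_op_st.sum reduce.exe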

5.3.2 Unrolling

The unrolled kernel and its launch:


__global__ void reduceSmemUnroll(int *g_idata, int *g_odata, unsigned int n){
    __shared__ int smem[DIM];

    unsigned int tid  = threadIdx.x;

    // each thread handles 4 elements, so each block covers 4 * blockDim.x input elements
    unsigned int idx = blockIdx.x * blockDim.x * 4 + threadIdx.x;

    // unrolling 4 data blocks: sum 4 elements spaced blockDim.x apart
    int tmpSum = 0;
    if (idx + 3 * blockDim.x < n){
        int a1 = g_idata[idx];
        int a2 = g_idata[idx + blockDim.x];
        int a3 = g_idata[idx + 2 * blockDim.x];
        int a4 = g_idata[idx + 3 * blockDim.x];
        tmpSum = a1 + a2 + a3 + a4;
    }

    // each thread stores its partial sum into shared memory
    smem[tid] = tmpSum;
    __syncthreads();

    if (blockDim.x >= 1024 &&  tid < 512) smem[tid] += smem[tid+ 512];
    __syncthreads();
    if (blockDim.x >= 512 &&  tid < 256) smem[tid] += smem[tid+ 256];
    __syncthreads();
    if (blockDim.x >= 256 &&  tid < 128) smem[tid] += smem[tid+ 128];
    __syncthreads();
    if (blockDim.x >= 128 &&  tid < 64) smem[tid] += smem[tid+ 64];
    __syncthreads();

    if (tid < 32){
        volatile int *vsmem  = smem;
        vsmem[tid] += vsmem[tid + 32];
        vsmem[tid] += vsmem[tid + 16];
        vsmem[tid] += vsmem[tid +  8];
        vsmem[tid] += vsmem[tid +  4];
        vsmem[tid] += vsmem[tid +  2];
        vsmem[tid] += vsmem[tid +  1];
    }

    if  (tid == 0){ g_odata[blockIdx.x] = smem[0];}
}


    // launch: only grid.x / 4 blocks are needed because each block now covers 4 data blocks
    cudaMemcpy(d_idata, h_idata, nBytes, cudaMemcpyHostToDevice);
    cudaDeviceSynchronize();
    timer.start();
    reduceSmemUnroll<<<grid.x / 4, block>>>(d_idata, d_odata, size);
    cudaDeviceSynchronize();
    timer.stop();
    elapsedTime1 = timer.elapsedms();
    cudaMemcpy(h_odata, d_odata, grid.x / 4 * sizeof(int), cudaMemcpyDeviceToHost);
    gpu_sum = 0;
    for (int i = 0; i < grid.x / 4; i++){
        gpu_sum += h_odata[i];
    }
    printf("reduceSmemUnroll gpu reduce time: %f,  sum: %d, grid, block (%d, %d)\n", elapsedTime1, gpu_sum, grid.x / 4, block.x);

nsys output:

Time (%)  Total Time (ns)  Instances  Avg (ns)  Med (ns)  Min (ns)  Max (ns)  StdDev (ns)                      Name
 --------  ---------------  ---------  --------  --------  --------  --------  -----------  --------------------------------------------
     34.6           135329          1  135329.0  135329.0    135329    135329          0.0  warmup(int *, int *, unsigned int)
     33.0           129056          1  129056.0  129056.0    129056    129056          0.0  reduceGmem(int *, int *, unsigned int)      
     25.6            99968          1   99968.0   99968.0     99968     99968          0.0  reduceSmem(int *, int *, unsigned int)
      6.7            26335          1   26335.0   26335.0     26335     26335          0.0  reduceSmemUnroll(int *, int *, unsigned int)

5.3.3 Dynamic Shared Memory

// dynamic declaration inside the kernel (the size is supplied at launch time)
extern __shared__ int smem[];

// launch: the third execution-configuration argument gives the shared-memory size in bytes
reduceSmemUnrollDyn<<<grid.x / 4, block, DIM * sizeof(int)>>>(d_idata, d_odata, size);
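
The full reduceSmemUnrollDyn kernel is not listed in this post. A sketch of what it presumably looks like, identical to reduceSmemUnroll except that the shared-memory array is declared extern and sized by the launch configuration, is:

__global__ void reduceSmemUnrollDyn(int *g_idata, int *g_odata, unsigned int n){
    // dynamically allocated shared memory, sized at launch time
    extern __shared__ int smem[];

    unsigned int tid = threadIdx.x;
    unsigned int idx = blockIdx.x * blockDim.x * 4 + threadIdx.x;

    // each thread first sums 4 elements from global memory
    int tmpSum = 0;
    if (idx + 3 * blockDim.x < n){
        tmpSum = g_idata[idx] + g_idata[idx + blockDim.x]
               + g_idata[idx + 2 * blockDim.x] + g_idata[idx + 3 * blockDim.x];
    }
    smem[tid] = tmpSum;
    __syncthreads();

    // in-place reduction in shared memory
    if (blockDim.x >= 1024 && tid < 512) smem[tid] += smem[tid + 512];
    __syncthreads();
    if (blockDim.x >= 512 && tid < 256) smem[tid] += smem[tid + 256];
    __syncthreads();
    if (blockDim.x >= 256 && tid < 128) smem[tid] += smem[tid + 128];
    __syncthreads();
    if (blockDim.x >= 128 && tid < 64) smem[tid] += smem[tid + 64];
    __syncthreads();

    // unrolled reduction within the final warp
    if (tid < 32){
        volatile int *vsmem = smem;
        vsmem[tid] += vsmem[tid + 32];
        vsmem[tid] += vsmem[tid + 16];
        vsmem[tid] += vsmem[tid +  8];
        vsmem[tid] += vsmem[tid +  4];
        vsmem[tid] += vsmem[tid +  2];
        vsmem[tid] += vsmem[tid +  1];
    }

    if (tid == 0) g_odata[blockIdx.x] = smem[0];
}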

There is no significant performance difference between the kernel that uses dynamically allocated shared memory and the one that uses statically allocated shared memory.

Time (%)  Total Time (ns)  Instances  Avg (ns)  Med (ns)  Min (ns)  Max (ns)  StdDev (ns)                       Name
 --------  ---------------  ---------  --------  --------  --------  --------  -----------  -----------------------------------------------
     35.2           135009          1  135009.0  135009.0    135009    135009          0.0  warmup(int *, int *, unsigned int)
     33.5           128576          1  128576.0  128576.0    128576    128576          0.0  reduceGmem(int *, int *, unsigned int)
     19.5            74753          1   74753.0   74753.0     74753     74753          0.0  reduceSmem(int *, int *, unsigned int)
      5.9            22752          1   22752.0   22752.0     22752     22752          0.0  reduceSmemUnroll(int *, int *, unsigned int)
      5.9            22752          1   22752.0   22752.0     22752     22752          0.0  reduceSmemUnrollDyn(int *, int *, unsigned int)
