
This section revisits the parallel reduction problem discussed in Chapter 3.

5.3.1 Parallel Reduction with Shared Memory

reduceGmem – baseline that performs the reduction entirely in global memory
reduceSmem – the same reduction using shared memory

#include <cuda_runtime.h>
#include <stdio.h>
#include "../common/common.h"
#include <iostream>

#define DIM 128

// CPU reference: recursive in-place pairwise reduction
int recursiveReduce(int *data, int const size)
{
    if (size == 1) return data[0];

    int const stride = size / 2;

    for (int i = 0; i < stride; i++)
    {
        data[i] += data[i + stride];
    }

    return recursiveReduce(data, stride);
}

__global__ void warmup(int *g_idata, int *g_odata, unsigned int n)
{
    unsigned int tid = threadIdx.x;

    // convert the global data pointer to the local pointer of this block
    int *idata = g_idata + blockIdx.x * blockDim.x;
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx >= n) return;

    // in-place reduction in global memory
    if (blockDim.x >= 1024 && tid < 512) idata[tid] += idata[tid + 512];
    __syncthreads();

    if (blockDim.x >= 512 && tid < 256) idata[tid] += idata[tid + 256];
    __syncthreads();

    if (blockDim.x >= 256 && tid < 128) idata[tid] += idata[tid + 128];
    __syncthreads();

    if (blockDim.x >= 128 && tid < 64) idata[tid] += idata[tid + 64];
    __syncthreads();

    // unroll the last warp
    if (tid < 32)
    {
        volatile int *vmem = idata;
        vmem[tid] += vmem[tid + 32];
        vmem[tid] += vmem[tid + 16];
        vmem[tid] += vmem[tid +  8];
        vmem[tid] += vmem[tid +  4];
        vmem[tid] += vmem[tid +  2];
        vmem[tid] += vmem[tid +  1];
    }

    // the first thread writes this block's partial sum
    if (tid == 0) g_odata[blockIdx.x] = idata[0];
}

__global__ void reduceGmem(int *g_idata, int *g_odata, unsigned int n)
{
    // identical to warmup; the separate warmup launch absorbs the cold-start
    // cost so it does not distort the timing of the measured kernels
    unsigned int tid = threadIdx.x;

    int *idata = g_idata + blockIdx.x * blockDim.x;
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx >= n) return;

    if (blockDim.x >= 1024 && tid < 512) idata[tid] += idata[tid + 512];
    __syncthreads();

    if (blockDim.x >= 512 && tid < 256) idata[tid] += idata[tid + 256];
    __syncthreads();

    if (blockDim.x >= 256 && tid < 128) idata[tid] += idata[tid + 128];
    __syncthreads();

    if (blockDim.x >= 128 && tid < 64) idata[tid] += idata[tid + 64];
    __syncthreads();

    if (tid < 32)
    {
        volatile int *vmem = idata;
        vmem[tid] += vmem[tid + 32];
        vmem[tid] += vmem[tid + 16];
        vmem[tid] += vmem[tid +  8];
        vmem[tid] += vmem[tid +  4];
        vmem[tid] += vmem[tid +  2];
        vmem[tid] += vmem[tid +  1];
    }

    if (tid == 0) g_odata[blockIdx.x] = idata[0];
}

__global__ void reduceSmem(int *g_idata, int *g_odata, unsigned int n)
{
    __shared__ int smem[DIM];

    unsigned int tid = threadIdx.x;

    // convert the global data pointer to the local pointer of this block
    int *idata = g_idata + blockIdx.x * blockDim.x;
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx >= n) return;

    // each thread loads one element into shared memory
    smem[tid] = idata[tid];
    __syncthreads();

    // in-place reduction in shared memory
    if (blockDim.x >= 1024 && tid < 512) smem[tid] += smem[tid + 512];
    __syncthreads();

    if (blockDim.x >= 512 && tid < 256) smem[tid] += smem[tid + 256];
    __syncthreads();

    if (blockDim.x >= 256 && tid < 128) smem[tid] += smem[tid + 128];
    __syncthreads();

    if (blockDim.x >= 128 && tid < 64) smem[tid] += smem[tid + 64];
    __syncthreads();

    // unroll the last warp
    if (tid < 32)
    {
        volatile int *vsmem = smem;
        vsmem[tid] += vsmem[tid + 32];
        vsmem[tid] += vsmem[tid + 16];
        vsmem[tid] += vsmem[tid +  8];
        vsmem[tid] += vsmem[tid +  4];
        vsmem[tid] += vsmem[tid +  2];
        vsmem[tid] += vsmem[tid +  1];
    }

    if (tid == 0) g_odata[blockIdx.x] = smem[0];
}

int main(int argc, char **argv)
{
    printf("%s starting\n", argv[0]);

    int dev = 0;
    cudaDeviceProp deviceprop;
    CHECK(cudaGetDeviceProperties(&deviceprop, dev));
    printf("Using Device %d : %s\n", dev, deviceprop.name);

    int size = 1 << 24;
    int blocksize = 512;
    if (argc > 1)
    {
        blocksize = atoi(argv[1]);
    }
    // note: blocksize is parsed but unused; the launches below use DIM so the
    // block size always matches the statically sized shared-memory arrays

    dim3 block(DIM, 1);  // 1D block
    dim3 grid((size + block.x - 1) / block.x, 1);

    size_t nBytes = size * sizeof(int);
    int *h_idata = (int *)malloc(nBytes);
    int *h_odata = (int *)malloc(grid.x * sizeof(int));  // one partial sum per block
    int *temp    = (int *)malloc(nBytes);

    // initialize the input array
    for (int i = 0; i < size; i++)
    {
        h_idata[i] = (int)(rand() & 0xff);
    }

    int sum = 0;
    for (int i = 0; i < size; i++)
    {
        sum += h_idata[i];
    }
    printf("sum value is : %d\n", sum);

    memcpy(temp, h_idata, nBytes);

    int gpu_sum = 0;
    int *d_idata = NULL;
    int *d_odata = NULL;
    cudaMalloc((void **)&d_idata, nBytes);
    cudaMalloc((void **)&d_odata, grid.x * sizeof(int));

    // cpu sum
    Timer timer;
    timer.start();
    int cpu_sum = recursiveReduce(temp, size);
    timer.stop();
    float elapsedTime = timer.elapsedms();
    printf("cpu reduce time: %f,  sum: %d\n", elapsedTime, cpu_sum);

    // gpu sum: warmup
    cudaMemcpy(d_idata, h_idata, nBytes, cudaMemcpyHostToDevice);
    cudaDeviceSynchronize();
    timer.start();
    warmup<<<grid.x, block>>>(d_idata, d_odata, size);
    cudaDeviceSynchronize();
    timer.stop();
    float elapsedTime1 = timer.elapsedms();
    cudaMemcpy(h_odata, d_odata, grid.x * sizeof(int), cudaMemcpyDeviceToHost);
    gpu_sum = 0;
    for (int i = 0; i < grid.x; i++)
    {
        gpu_sum += h_odata[i];
    }
    printf("warm up reduce time: %f,  sum: %d\n", elapsedTime1, gpu_sum);

    // gpu sum: reduceGmem (the kernels reduce in place, so restore the input first)
    cudaMemcpy(d_idata, h_idata, nBytes, cudaMemcpyHostToDevice);
    cudaDeviceSynchronize();
    timer.start();
    reduceGmem<<<grid.x, block>>>(d_idata, d_odata, size);
    cudaDeviceSynchronize();
    timer.stop();
    elapsedTime1 = timer.elapsedms();
    cudaMemcpy(h_odata, d_odata, grid.x * sizeof(int), cudaMemcpyDeviceToHost);
    gpu_sum = 0;
    for (int i = 0; i < grid.x; i++)
    {
        gpu_sum += h_odata[i];
    }
    printf("reduceGmem gpu reduce time: %f,  sum: %d, grid, block (%d %d)\n", elapsedTime1, gpu_sum, grid.x, block.x);

    // gpu sum: reduceSmem
    cudaMemcpy(d_idata, h_idata, nBytes, cudaMemcpyHostToDevice);
    cudaDeviceSynchronize();
    timer.start();
    reduceSmem<<<grid.x, block>>>(d_idata, d_odata, size);
    cudaDeviceSynchronize();
    timer.stop();
    elapsedTime1 = timer.elapsedms();
    cudaMemcpy(h_odata, d_odata, grid.x * sizeof(int), cudaMemcpyDeviceToHost);
    gpu_sum = 0;
    for (int i = 0; i < grid.x; i++)
    {
        gpu_sum += h_odata[i];
    }
    printf("reduceSmem gpu reduce time: %f,  sum: %d, grid, block (%d %d)\n", elapsedTime1, gpu_sum, grid.x, block.x);

    cudaFree(d_idata);
    cudaFree(d_odata);
    cudaDeviceReset();
    free(h_idata);
    free(h_odata);
    free(temp);

    return 0;
}
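The post jumps straight to profiling; the build step is not shown in the original. A plausible command, assuming the source file is named reduce.cu, that ../common/common.h supplies the CHECK macro and Timer class used above, and that the output name matches the nsys invocation below:

nvcc -O2 -o reduce.exe reduce.cu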

Profile the program with nsys:

nsys profile --stats=true reduce.exe

Output:

Time (%)  Total Time (ns)  Instances  Avg (ns)  Med (ns)  Min (ns)  Max (ns)  StdDev (ns)  Name
--------  ---------------  ---------  --------  --------  --------  --------  -----------  --------------------------------------
    39.8           134721          1  134721.0  134721.0    134721    134721          0.0  warmup(int *, int *, unsigned int)
    38.0           128385          1  128385.0  128385.0    128385    128385          0.0  reduceGmem(int *, int *, unsigned int)
    22.2            75040          1   75040.0   75040.0     75040     75040          0.0  reduceSmem(int *, int *, unsigned int)

reduceSmem runs in roughly 60% of reduceGmem's time: the multi-round in-place reduction now reads and writes shared memory, so each input element is fetched from global memory only once.
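One caveat the post does not raise: the volatile last-warp trick relies on implicit warp-synchronous execution, which is no longer guaranteed under the independent thread scheduling introduced with Volta. A minimal sketch of a safer last-warp stage using warp shuffles (available since CUDA 9), as a possible drop-in replacement for the tid < 32 block in reduceSmem; this is my addition, not part of the original code:

// safer last-warp reduction using warp shuffles instead of the volatile idiom;
// assumes the preceding __syncthreads() completed the blockDim.x >= 128 step,
// so smem[0..63] hold valid partial sums
if (tid < 32)
{
    int val = smem[tid] + smem[tid + 32];
    // each step folds the upper half of the active lanes into the lower half
    val += __shfl_down_sync(0xffffffff, val, 16);
    val += __shfl_down_sync(0xffffffff, val,  8);
    val += __shfl_down_sync(0xffffffff, val,  4);
    val += __shfl_down_sync(0xffffffff, val,  2);
    val += __shfl_down_sync(0xffffffff, val,  1);
    if (tid == 0) g_odata[blockIdx.x] = val;
}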

5.3.2 Unrolling

The unrolled kernel and its invocation:


__global__ void reduceSmemUnroll(int *g_idata, int *g_odata, unsigned int n)
{
    __shared__ int smem[DIM];

    unsigned int tid = threadIdx.x;

    // each thread block now covers 4 consecutive data blocks
    unsigned int idx = blockIdx.x * blockDim.x * 4 + threadIdx.x;

    // unrolling 4 blocks: each thread first sums 4 strided elements
    int tmpSum = 0;
    if (idx + 3 * blockDim.x < n)  // fixed: the original <= n read one element past the end
    {
        int a1 = g_idata[idx];
        int a2 = g_idata[idx + blockDim.x];
        int a3 = g_idata[idx + 2 * blockDim.x];
        int a4 = g_idata[idx + 3 * blockDim.x];
        tmpSum = a1 + a2 + a3 + a4;
    }

    // each thread stores its partial sum into shared memory
    smem[tid] = tmpSum;
    __syncthreads();

    // in-place reduction in shared memory
    if (blockDim.x >= 1024 && tid < 512) smem[tid] += smem[tid + 512];
    __syncthreads();

    if (blockDim.x >= 512 && tid < 256) smem[tid] += smem[tid + 256];
    __syncthreads();

    if (blockDim.x >= 256 && tid < 128) smem[tid] += smem[tid + 128];
    __syncthreads();

    if (blockDim.x >= 128 && tid < 64) smem[tid] += smem[tid + 64];
    __syncthreads();

    // unroll the last warp
    if (tid < 32)
    {
        volatile int *vsmem = smem;
        vsmem[tid] += vsmem[tid + 32];
        vsmem[tid] += vsmem[tid + 16];
        vsmem[tid] += vsmem[tid +  8];
        vsmem[tid] += vsmem[tid +  4];
        vsmem[tid] += vsmem[tid +  2];
        vsmem[tid] += vsmem[tid +  1];
    }

    if (tid == 0) g_odata[blockIdx.x] = smem[0];
}

// invocation: launch a quarter of the blocks, each handling 4x the data
cudaMemcpy(d_idata, h_idata, nBytes, cudaMemcpyHostToDevice);
cudaDeviceSynchronize();
timer.start();
reduceSmemUnroll<<<grid.x / 4, block>>>(d_idata, d_odata, size);
cudaDeviceSynchronize();
timer.stop();
elapsedTime1 = timer.elapsedms();
cudaMemcpy(h_odata, d_odata, grid.x / 4 * sizeof(int), cudaMemcpyDeviceToHost);
gpu_sum = 0;
for (int i = 0; i < grid.x / 4; i++)
{
    gpu_sum += h_odata[i];
}
printf("reduceSmemUnroll gpu reduce time: %f,  sum: %d, grid, block (%d %d)\n", elapsedTime1, gpu_sum, grid.x / 4, block.x);

nsys output:

Time (%)  Total Time (ns)  Instances  Avg (ns)  Med (ns)  Min (ns)  Max (ns)  StdDev (ns)  Name
--------  ---------------  ---------  --------  --------  --------  --------  -----------  --------------------------------------------
    34.6           135329          1  135329.0  135329.0    135329    135329          0.0  warmup(int *, int *, unsigned int)
    33.0           129056          1  129056.0  129056.0    129056    129056          0.0  reduceGmem(int *, int *, unsigned int)
    25.6            99968          1   99968.0   99968.0     99968     99968          0.0  reduceSmem(int *, int *, unsigned int)
     6.7            26335          1   26335.0   26335.0     26335     26335          0.0  reduceSmemUnroll(int *, int *, unsigned int)

reduceSmemUnroll is nearly 4x faster than reduceSmem in this run: each thread issues four independent global loads that can be in flight simultaneously, and only a quarter as many blocks run the shared-memory reduction.

5.3.3 Dynamic Shared Memory

// dynamic declaration inside the kernel
extern __shared__ int smem[];

// launch: the third configuration argument specifies the shared-memory size in bytes
reduceSmemUnrollDyn<<<grid.x / 4, block, DIM * sizeof(int)>>>(d_idata, d_odata, size);
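The post shows only the declaration and the launch. For completeness, a sketch of the full kernel, assuming it is identical to reduceSmemUnroll except that the shared-memory array is declared extern and its size comes from the launch configuration:

__global__ void reduceSmemUnrollDyn(int *g_idata, int *g_odata, unsigned int n)
{
    // size supplied by the third launch-configuration argument
    extern __shared__ int smem[];

    unsigned int tid = threadIdx.x;
    unsigned int idx = blockIdx.x * blockDim.x * 4 + threadIdx.x;

    // unrolling 4 data blocks per thread block, as in reduceSmemUnroll
    int tmpSum = 0;
    if (idx + 3 * blockDim.x < n)
    {
        tmpSum = g_idata[idx] + g_idata[idx + blockDim.x]
               + g_idata[idx + 2 * blockDim.x] + g_idata[idx + 3 * blockDim.x];
    }

    smem[tid] = tmpSum;
    __syncthreads();

    // in-place reduction in shared memory
    if (blockDim.x >= 1024 && tid < 512) smem[tid] += smem[tid + 512];
    __syncthreads();
    if (blockDim.x >= 512 && tid < 256) smem[tid] += smem[tid + 256];
    __syncthreads();
    if (blockDim.x >= 256 && tid < 128) smem[tid] += smem[tid + 128];
    __syncthreads();
    if (blockDim.x >= 128 && tid < 64) smem[tid] += smem[tid + 64];
    __syncthreads();

    // unroll the last warp
    if (tid < 32)
    {
        volatile int *vsmem = smem;
        vsmem[tid] += vsmem[tid + 32];
        vsmem[tid] += vsmem[tid + 16];
        vsmem[tid] += vsmem[tid +  8];
        vsmem[tid] += vsmem[tid +  4];
        vsmem[tid] += vsmem[tid +  2];
        vsmem[tid] += vsmem[tid +  1];
    }

    if (tid == 0) g_odata[blockIdx.x] = smem[0];
}

Because the size is supplied at launch time, the same kernel binary can be reused with different block sizes without recompiling.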

There is no significant performance difference between the kernel that allocates its shared memory dynamically and the one that allocates it statically.

Time (%)  Total Time (ns)  Instances  Avg (ns)  Med (ns)  Min (ns)  Max (ns)  StdDev (ns)  Name
--------  ---------------  ---------  --------  --------  --------  --------  -----------  -----------------------------------------------
    35.2           135009          1  135009.0  135009.0    135009    135009          0.0  warmup(int *, int *, unsigned int)
    33.5           128576          1  128576.0  128576.0    128576    128576          0.0  reduceGmem(int *, int *, unsigned int)
    19.5            74753          1   74753.0   74753.0     74753     74753          0.0  reduceSmem(int *, int *, unsigned int)
     5.9            22752          1   22752.0   22752.0     22752     22752          0.0  reduceSmemUnroll(int *, int *, unsigned int)
     5.9            22752          1   22752.0   22752.0     22752     22752          0.0  reduceSmemUnrollDyn(int *, int *, unsigned int)