当前位置: 首页 > news >正文

【cuda学习日记】4.3 结构体数组与数组结构体

4.3 数组结构体(AoS)和结构体数组(SoA)

AoS方法进行存储

struct innerStruct{
	float x;
	float y;
};

struct innerStruct myAOS[N];

SoA方法来存储数据

struct innerArray{
	float x[N];
	float y[N];
};

struct innerArray moa;

如图说明了AoS和SoA方法的内存布局,用AoS模式在GPU上存储示例数据并执行一个只有x字段的应用程序,将导致50%的带宽损失
在这里插入图片描述
4.3.1 简单示例AoS

#include <cuda_runtime.h>
#include <stdio.h>
#include "../common/common.h"
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <windows.h>

#define LEN 1 << 20

struct innerStruct{
	float x;
	float y;
};


struct innerArray{
	float x[LEN];
	float y[LEN];
};


__global__ void testInnerStruct(innerStruct *data, innerStruct *result, const int n){
    unsigned int i  = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n){
        innerStruct tmp = data[i];
        tmp.x += 10.f;
        tmp.y += 20.f;
        result[i] = tmp;
    }
}

__global__ void warmup(innerStruct *data, innerStruct *result, const int n){
    unsigned int i  = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n){
        innerStruct tmp = data[i];
        tmp.x += 10.f;
        tmp.y += 20.f;
        result[i] = tmp;
    }
}

void testInnerStructHost(innerStruct *data, innerStruct *result, const int n){
    for (int i = 0; i < n ; i ++){
        innerStruct tmp = data[i];
        tmp.x += 10.f;
        tmp.y += 20.f;
        result[i] = tmp;
    }
}

void initialInnerStruct(innerStruct *ip,  int size)
{
    for (int i = 0; i < size; i++)
    {
        ip[i].x = (float)(rand() & 0xFF) / 100.0f;
        ip[i].y = (float)(rand() & 0xFF) / 100.0f;
    }

    return;
}

void checkInnerStruct(innerStruct *hostRef, innerStruct *gpuRef, const int N)
{
    double epsilon = 1.0E-8;
    bool match = 1;

    for (int i = 0; i < N; i++)
    {
        if (abs(hostRef[i].x - gpuRef[i].x) > epsilon)
        {
            match = 0;
            printf("different on %dth element: host %f gpu %f\n", i,
                    hostRef[i].x, gpuRef[i].x);
            break;
        }

        if (abs(hostRef[i].y - gpuRef[i].y) > epsilon)
        {
            match = 0;
            printf("different on %dth element: host %f gpu %f\n", i,
                    hostRef[i].y, gpuRef[i].y);
            break;
        }
    }

    if (!match)  printf("Arrays do not match.\n\n");
}


int main(int argc, char ** argv){
    int dev = 0;
    cudaSetDevice(dev);
    cudaDeviceProp deviceprop;
    CHECK(cudaGetDeviceProperties(&deviceprop,dev));
    printf("device %d: %s \n", dev, deviceprop.name);

    int nElem = LEN;
    size_t nBytes = nElem * sizeof(innerStruct);

    innerStruct     *h_A = (innerStruct *)malloc(nBytes);
    innerStruct *hostRef = (innerStruct *)malloc(nBytes);
    innerStruct *gpuRef  = (innerStruct *)malloc(nBytes);
    
    initialInnerStruct(h_A, nElem);
    testInnerStructHost(h_A, hostRef, nElem);

    innerStruct *d_A, *d_C;
    cudaMalloc((innerStruct**)&d_A, nBytes);
    cudaMalloc((innerStruct**)&d_C, nBytes);
    

    cudaMemcpy(d_A, h_A, nBytes,cudaMemcpyHostToDevice);

    int blocksize = 128;
    if (argc > 1) blocksize = atoi(argv[1]);

    dim3 block(blocksize,1);
    dim3 grid((nElem + block.x - 1)/block.x, 1);

    Timer timer;
    timer.start();
    warmup<<<grid,block>>>(d_A, d_C, nElem);
    cudaDeviceSynchronize();
    timer.stop();
    float elapsedTime = timer.elapsedms();
    printf("warmup <<<%4d, %4d>>> elapsed %f ms \n", grid.x, block.x,  elapsedTime);

    timer.start();
    testInnerStruct<<<grid,block>>>(d_A, d_C, nElem);
    cudaDeviceSynchronize();
    timer.stop();
    elapsedTime = timer.elapsedms();
    printf("testInnerStruct <<<%4d, %4d>>> elapsed %f ms \n", grid.x, block.x,  elapsedTime);

    cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost);
    checkInnerStruct(hostRef, gpuRef, nElem);


    cudaFree(d_A);
    cudaFree(d_C);
    free(h_A);
    free(hostRef);
    free(gpuRef);

    cudaDeviceReset();
    return 0;
}

用NCU查看加载内存效率,只有50%:
smsp__sass_average_data_bytes_per_sector_mem_global_op_ld.pct [%] 50

执行时间:
testInnerStruct <<<8192, 128>>> elapsed 0.036864 ms

4.3.1 简单示例SoA

#include <cuda_runtime.h>
#include <stdio.h>
#include "../common/common.h"
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <windows.h>

#define LEN 1 << 20

struct innerStruct{
	float x;
	float y;
};


struct innerArray{
	float x[LEN];
	float y[LEN];
};

__global__ void testInnerArray( innerArray *data, innerArray *result, const int n){
    unsigned int i  = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        float tmpx = data -> x[i];
        float tmpy = data -> y[i];

        tmpx += 10.0f;
        tmpy += 20.0f;
        result -> x[i] = tmpx;
        result -> y[i] = tmpy;
    }
}

__global__ void warmup( innerArray *data, innerArray *result, const int n){
    unsigned int i  = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        float tmpx = data -> x[i];
        float tmpy = data -> y[i];

        tmpx += 10.0f;
        tmpy += 20.0f;
        result -> x[i] = tmpx;
        result -> y[i] = tmpy;
    }
}

// functions for inner array outer struct
void initialInnerArray(innerArray *ip,  int size)
{
    for (int i = 0; i < size; i++)
    {
        ip->x[i] = (float)( rand() & 0xFF ) / 100.0f;
        ip->y[i] = (float)( rand() & 0xFF ) / 100.0f;
    }

    return;
}

void testInnerArrayHost(innerArray *A, innerArray *C, const int n)
{
    for (int idx = 0; idx < n; idx++)
    {
        C->x[idx] = A->x[idx] + 10.f;
        C->y[idx] = A->y[idx] + 20.f;
    }

    return;
}

void checkInnerArray(innerArray *hostRef, innerArray *gpuRef, const int N)
{
    double epsilon = 1.0E-8;
    bool match = 1;

    for (int i = 0; i < N; i++)
    {
        if (abs(hostRef->x[i] - gpuRef->x[i]) > epsilon)
        {
            match = 0;
            printf("different on x %dth element: host %f gpu %f\n", i,
                   hostRef->x[i], gpuRef->x[i]);
            break;
        }

        if (abs(hostRef->y[i] - gpuRef->y[i]) > epsilon)
        {
            match = 0;
            printf("different on y %dth element: host %f gpu %f\n", i,
                   hostRef->y[i], gpuRef->y[i]);
            break;
        }
    }

    if (!match)  printf("Arrays do not match.\n\n");
}


int main(int argc, char ** argv){
    int dev = 0;
    cudaSetDevice(dev);
    cudaDeviceProp deviceprop;
    CHECK(cudaGetDeviceProperties(&deviceprop,dev));
    printf("device %d: %s \n", dev, deviceprop.name);

    int nElem = LEN;
    size_t nBytes =   sizeof(innerArray);

    innerArray     *h_A = (innerArray *)malloc(nBytes);
    innerArray *hostRef = (innerArray *)malloc(nBytes);
    innerArray *gpuRef  = (innerArray *)malloc(nBytes);
    
    initialInnerArray(h_A, nElem);
    testInnerArrayHost(h_A, hostRef, nElem);

    innerArray *d_A, *d_C;
    cudaMalloc((innerArray**)&d_A, nBytes);
    cudaMalloc((innerArray**)&d_C, nBytes);
    

    cudaMemcpy(d_A, h_A, nBytes,cudaMemcpyHostToDevice);

    int blocksize = 128;
    if (argc > 1) blocksize = atoi(argv[1]);

    dim3 block(blocksize,1);
    dim3 grid((nElem + block.x - 1)/block.x, 1);

    Timer timer;
    timer.start();
    warmup<<<grid,block>>>(d_A, d_C, nElem);
    cudaDeviceSynchronize();
    timer.stop();
    float elapsedTime = timer.elapsedms();
    printf("warmup <<<%4d, %4d>>> elapsed %f ms \n", grid.x, block.x,  elapsedTime);

    timer.start();
    testInnerArray<<<grid,block>>>(d_A, d_C, nElem);
    cudaDeviceSynchronize();
    timer.stop();
    elapsedTime = timer.elapsedms();
    printf("testInnerArray <<<%4d, %4d>>> elapsed %f ms \n", grid.x, block.x,  elapsedTime);

    cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost);
    checkInnerArray(hostRef, gpuRef, nElem);


    cudaFree(d_A);
    cudaFree(d_C);
    free(h_A);
    free(hostRef);
    free(gpuRef);

    cudaDeviceReset();
    return 0;
}

NCU查看内存加载效率:
smsp__sass_average_data_bytes_per_sector_mem_global_op_ld.pct [%] 100

优化设备内存带宽利用率有两个目标:
·对齐及合并内存访问,以减少带宽的浪费
·足够的并发内存操作,以隐藏内存延迟


文章转载自:

http://oDpaaFvk.mrfgy.cn
http://SCWq6Vt4.mrfgy.cn
http://nxNFmXLt.mrfgy.cn
http://DRLkdJkT.mrfgy.cn
http://lkBA8Sa5.mrfgy.cn
http://H1CyOrNe.mrfgy.cn
http://0pM2We2v.mrfgy.cn
http://Uqsfmpn3.mrfgy.cn
http://QgtIujUm.mrfgy.cn
http://YYhUn0HK.mrfgy.cn
http://WNjVQXmu.mrfgy.cn
http://5jqICi1a.mrfgy.cn
http://BpmuxQ7A.mrfgy.cn
http://Xe4HIGPx.mrfgy.cn
http://KTUDDKMo.mrfgy.cn
http://GuEkkxD4.mrfgy.cn
http://GHXBbZSM.mrfgy.cn
http://ETkgFGQX.mrfgy.cn
http://5uKd5DLr.mrfgy.cn
http://lVkApNgA.mrfgy.cn
http://4JgAztHB.mrfgy.cn
http://PLmb7Tqs.mrfgy.cn
http://bhaHEIl2.mrfgy.cn
http://4Fwkb8xk.mrfgy.cn
http://8rwQf7lG.mrfgy.cn
http://8sALWU5c.mrfgy.cn
http://y4ybf5Z7.mrfgy.cn
http://O0rZLimn.mrfgy.cn
http://9KkfEJaD.mrfgy.cn
http://pEBfPV1Q.mrfgy.cn
http://www.dtcms.com/a/36099.html

相关文章:

  • 【C++】CentOS环境搭建-安装log4cplus日志组件包及报错解决方案
  • SOME/IP-SD -- 协议英文原文讲解2
  • Git 分支操作
  • 【Redis 原理】通信协议 内存回收
  • [特殊字符] 蓝桥杯 Java B 组 之最小生成树(Prim、Kruskal) 并查集应用
  • 无人机+DeepSeek:放飞自我的智能化技术详解!
  • java23种设计模式-抽象工厂模式
  • DeepSeek-R1:通过强化学习激励大语言模型的推理能力
  • 陀螺匠·企业助手v1.8 产品介绍
  • c++_string模拟实现
  • Eureka、ZooKeeper 和 Nacos 之间的对比
  • YOLO11改进-模块-引入混合结构模块Mix Structure Block 提高多尺度、小目标
  • 使用Windbg调试目标进程排查C++软件异常的一般步骤与要点分享
  • 6层高速PCB设计入门第1~10讲
  • STM32CUBEIDE FreeRTOS操作教程(十三):task api 任务访问函数
  • 原生稀疏注意力NSA 替换transformer 注意力进行文本生成训练
  • Web自动化之Selenium添加网站Cookies实现免登录
  • C++ ——— 二叉搜索树
  • EasyExcel 使用指南:基础操作与常见问题
  • MySQL 最左前缀原则:原理、应用与优化
  • Winform工具箱、属性、事件
  • 04基于vs2022的c语言笔记——数据类型
  • C# httpclient 和 Flurl.Http 的测试
  • Mesh自组网技术及应用
  • Threejs教程三【揭秘3D贴图魔法】
  • 如何使用爬虫获取淘宝商品详情:API返回值说明与案例指南
  • Unity 第三人称人物切动画时人物莫名旋转
  • 3.18 ReAct 理论实战:构建动态推理-行动循环的企业级 Agent
  • pycharm技巧--鼠标滚轮放大或缩小 Pycharm 字体大小
  • ESP8266+STM32+阿里云保姆级教程(AT指令+MQTT)