【cuda学习日记】4.3 结构体数组与数组结构体
4.3 数组结构体(AoS)和结构体数组(SoA)
AoS方法进行存储
struct innerStruct{
float x;
float y;
};
struct innerStruct myAOS[N];
SoA方法来存储数据
struct innerArray{
float x[N];
float y[N];
};
struct innerArray moa;
如图说明了AoS和SoA方法的内存布局,用AoS模式在GPU上存储示例数据并执行一个只有x字段的应用程序,将导致50%的带宽损失
4.3.1 简单示例AoS
#include <cuda_runtime.h>
#include <stdio.h>
#include "../common/common.h"
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <windows.h>
// Number of elements (2^20). Parenthesized so the macro expands safely
// inside larger expressions (e.g. LEN * 2, which with the bare form would
// parse as 1 << (20 * 2) and overflow int).
#define LEN (1 << 20)
// Array-of-Structures (AoS) element: x and y are interleaved in memory,
// so a warp reading a single field touches twice the bytes it needs
// (the 50% load efficiency measured below).
struct innerStruct{
float x;
float y;
};
// Structure-of-Arrays (SoA) layout: all x values contiguous, then all y.
// Unused in this AoS demo; declared here for comparison with the SoA version.
struct innerArray{
float x[LEN];
float y[LEN];
};
// One thread per element: load the whole AoS struct, offset both fields,
// store it back. Expects a 1D grid of 1D blocks covering at least n threads.
// The whole-struct load is intentional — it is the access pattern this
// example profiles.
__global__ void testInnerStruct(innerStruct *data, innerStruct *result, const int n) {
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) return;          // guard the grid tail
    innerStruct elem = data[idx];  // interleaved x/y load
    elem.x += 10.f;
    elem.y += 20.f;
    result[idx] = elem;
}
// Same body as testInnerStruct; launched once before the timed run so
// one-time startup costs do not skew the measurement.
__global__ void warmup(innerStruct *data, innerStruct *result, const int n) {
    unsigned int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid < n) {
        innerStruct v = data[gid];
        v.x += 10.f;
        v.y += 20.f;
        result[gid] = v;
    }
}
// CPU reference: apply the same +10/+20 field offsets the kernel applies,
// so the device output can be validated element-by-element.
void testInnerStructHost(innerStruct *data, innerStruct *result, const int n) {
    for (int idx = 0; idx < n; ++idx) {
        result[idx].x = data[idx].x + 10.f;
        result[idx].y = data[idx].y + 20.f;
    }
}
// Fill both fields of each element with small pseudo-random values
// in [0, 2.55]. rand() is called for x first, then y, per element.
void initialInnerStruct(innerStruct *ip, int size)
{
    for (int i = 0; i < size; ++i) {
        float rx = (float)(rand() & 0xFF) / 100.0f;
        float ry = (float)(rand() & 0xFF) / 100.0f;
        ip[i].x = rx;
        ip[i].y = ry;
    }
}
// Compare host and device results field-by-field; report the first mismatch.
// Bug fix: the original used abs(), which with only <stdlib.h> in scope is
// the *integer* overload — fractional differences truncated to 0 and real
// mismatches were silently missed. Compute the magnitude in double instead
// (no extra include required).
void checkInnerStruct(innerStruct *hostRef, innerStruct *gpuRef, const int N)
{
    double epsilon = 1.0E-8;
    bool match = 1;
    for (int i = 0; i < N; i++)
    {
        double dx = (double)hostRef[i].x - (double)gpuRef[i].x;
        if (dx < 0.0) dx = -dx;
        if (dx > epsilon)
        {
            match = 0;
            printf("different on %dth element: host %f gpu %f\n", i,
                   hostRef[i].x, gpuRef[i].x);
            break;
        }
        double dy = (double)hostRef[i].y - (double)gpuRef[i].y;
        if (dy < 0.0) dy = -dy;
        if (dy > epsilon)
        {
            match = 0;
            printf("different on %dth element: host %f gpu %f\n", i,
                   hostRef[i].y, gpuRef[i].y);
            break;
        }
    }
    if (!match) printf("Arrays do not match.\n\n");
}
// Entry point for the AoS demo: allocate interleaved-struct buffers, run
// warmup + timed kernel, validate against the CPU reference.
// argv[1] (optional) overrides the block size, default 128.
// Fix: every CUDA API call is now wrapped in CHECK (consistent with the
// existing cudaGetDeviceProperties check), and cudaGetLastError() is read
// after each launch so bad launch configurations are not silent.
int main(int argc, char **argv) {
    // select device and report it
    int dev = 0;
    CHECK(cudaSetDevice(dev));
    cudaDeviceProp deviceprop;
    CHECK(cudaGetDeviceProperties(&deviceprop, dev));
    printf("device %d: %s \n", dev, deviceprop.name);

    // host buffers: input, CPU reference, GPU result
    int nElem = LEN;
    size_t nBytes = nElem * sizeof(innerStruct);
    innerStruct *h_A = (innerStruct *)malloc(nBytes);
    innerStruct *hostRef = (innerStruct *)malloc(nBytes);
    innerStruct *gpuRef = (innerStruct *)malloc(nBytes);
    initialInnerStruct(h_A, nElem);
    testInnerStructHost(h_A, hostRef, nElem);

    // device buffers
    innerStruct *d_A, *d_C;
    CHECK(cudaMalloc((innerStruct **)&d_A, nBytes));
    CHECK(cudaMalloc((innerStruct **)&d_C, nBytes));
    CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));

    // 1D launch configuration, block size overridable from the command line
    int blocksize = 128;
    if (argc > 1) blocksize = atoi(argv[1]);
    dim3 block(blocksize, 1);
    dim3 grid((nElem + block.x - 1) / block.x, 1);

    Timer timer;
    timer.start();
    warmup<<<grid, block>>>(d_A, d_C, nElem);
    CHECK(cudaGetLastError());       // catch bad launch configuration
    CHECK(cudaDeviceSynchronize());  // surface async kernel errors before stopping the timer
    timer.stop();
    float elapsedTime = timer.elapsedms();
    printf("warmup <<<%4d, %4d>>> elapsed %f ms \n", grid.x, block.x, elapsedTime);

    timer.start();
    testInnerStruct<<<grid, block>>>(d_A, d_C, nElem);
    CHECK(cudaGetLastError());
    CHECK(cudaDeviceSynchronize());
    timer.stop();
    elapsedTime = timer.elapsedms();
    printf("testInnerStruct <<<%4d, %4d>>> elapsed %f ms \n", grid.x, block.x, elapsedTime);

    // copy back and validate
    CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));
    checkInnerStruct(hostRef, gpuRef, nElem);

    CHECK(cudaFree(d_A));
    CHECK(cudaFree(d_C));
    free(h_A);
    free(hostRef);
    free(gpuRef);
    CHECK(cudaDeviceReset());
    return 0;
}
用NCU查看加载内存效率,只有50%:
smsp__sass_average_data_bytes_per_sector_mem_global_op_ld.pct [%] 50
执行时间:
testInnerStruct <<<8192, 128>>> elapsed 0.036864 ms
4.3.2 简单示例SoA
#include <cuda_runtime.h>
#include <stdio.h>
#include "../common/common.h"
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <windows.h>
// Number of elements (2^20). Parenthesized so the macro expands safely
// inside larger expressions (the bare 1 << 20 would bind shift last,
// e.g. LEN * 2 would become 1 << 40).
#define LEN (1 << 20)
// AoS element with interleaved x/y fields.
// Unused in this SoA demo; declared for comparison with the AoS version.
struct innerStruct{
float x;
float y;
};
// Structure-of-Arrays (SoA): all x values contiguous, then all y, so
// consecutive threads reading x[i] produce fully coalesced transactions
// (the 100% load efficiency measured below).
struct innerArray{
float x[LEN];
float y[LEN];
};
// One thread per index: SoA access — x[i] and y[i] are read from separate
// contiguous arrays, so each field load by a warp coalesces fully.
// Expects a 1D grid of 1D blocks covering at least n threads.
__global__ void testInnerArray(innerArray *data, innerArray *result, const int n) {
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) return;  // grid-tail guard
    float vx = data->x[idx] + 10.0f;
    float vy = data->y[idx] + 20.0f;
    result->x[idx] = vx;
    result->y[idx] = vy;
}
// Same body as testInnerArray; launched once beforehand so the timed run
// is not skewed by one-time startup cost.
__global__ void warmup(innerArray *data, innerArray *result, const int n) {
    unsigned int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid < n) {
        float px = data->x[gid];
        float py = data->y[gid];
        result->x[gid] = px + 10.0f;
        result->y[gid] = py + 20.0f;
    }
}
// Host-side helpers for the SoA (inner-array) layout.
// Fill both arrays with small pseudo-random values in [0, 2.55];
// rand() is called for x then y at each index, matching the AoS initializer.
void initialInnerArray(innerArray *ip, int size)
{
    int i = 0;
    while (i < size) {
        ip->x[i] = (float)(rand() & 0xFF) / 100.0f;
        ip->y[i] = (float)(rand() & 0xFF) / 100.0f;
        ++i;
    }
}
// CPU reference for the SoA kernel: the same +10/+20 offsets, applied
// one array at a time.
void testInnerArrayHost(innerArray *A, innerArray *C, const int n)
{
    for (int k = 0; k < n; ++k)
        C->x[k] = A->x[k] + 10.f;
    for (int k = 0; k < n; ++k)
        C->y[k] = A->y[k] + 20.f;
}
// Compare host and device SoA results; report the first mismatch.
// Bug fix: the original used abs(), which with only <stdlib.h> in scope is
// the *integer* overload — fractional differences truncated to 0 and real
// mismatches were silently missed. Compute the magnitude in double instead
// (no extra include required).
void checkInnerArray(innerArray *hostRef, innerArray *gpuRef, const int N)
{
    double epsilon = 1.0E-8;
    bool match = 1;
    for (int i = 0; i < N; i++)
    {
        double dx = (double)hostRef->x[i] - (double)gpuRef->x[i];
        if (dx < 0.0) dx = -dx;
        if (dx > epsilon)
        {
            match = 0;
            printf("different on x %dth element: host %f gpu %f\n", i,
                   hostRef->x[i], gpuRef->x[i]);
            break;
        }
        double dy = (double)hostRef->y[i] - (double)gpuRef->y[i];
        if (dy < 0.0) dy = -dy;
        if (dy > epsilon)
        {
            match = 0;
            printf("different on y %dth element: host %f gpu %f\n", i,
                   hostRef->y[i], gpuRef->y[i]);
            break;
        }
    }
    if (!match) printf("Arrays do not match.\n\n");
}
// Entry point for the SoA demo: allocate one innerArray per side (note
// nBytes = sizeof(innerArray) — the struct already holds both LEN-sized
// arrays), run warmup + timed kernel, validate against the CPU reference.
// argv[1] (optional) overrides the block size, default 128.
// Fix: every CUDA API call is now wrapped in CHECK (consistent with the
// existing cudaGetDeviceProperties check), and cudaGetLastError() is read
// after each launch so bad launch configurations are not silent.
int main(int argc, char **argv) {
    // select device and report it
    int dev = 0;
    CHECK(cudaSetDevice(dev));
    cudaDeviceProp deviceprop;
    CHECK(cudaGetDeviceProperties(&deviceprop, dev));
    printf("device %d: %s \n", dev, deviceprop.name);

    // host buffers: input, CPU reference, GPU result
    int nElem = LEN;
    size_t nBytes = sizeof(innerArray);  // whole SoA struct (x[LEN] + y[LEN])
    innerArray *h_A = (innerArray *)malloc(nBytes);
    innerArray *hostRef = (innerArray *)malloc(nBytes);
    innerArray *gpuRef = (innerArray *)malloc(nBytes);
    initialInnerArray(h_A, nElem);
    testInnerArrayHost(h_A, hostRef, nElem);

    // device buffers
    innerArray *d_A, *d_C;
    CHECK(cudaMalloc((innerArray **)&d_A, nBytes));
    CHECK(cudaMalloc((innerArray **)&d_C, nBytes));
    CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));

    // 1D launch configuration, block size overridable from the command line
    int blocksize = 128;
    if (argc > 1) blocksize = atoi(argv[1]);
    dim3 block(blocksize, 1);
    dim3 grid((nElem + block.x - 1) / block.x, 1);

    Timer timer;
    timer.start();
    warmup<<<grid, block>>>(d_A, d_C, nElem);
    CHECK(cudaGetLastError());       // catch bad launch configuration
    CHECK(cudaDeviceSynchronize());  // surface async kernel errors before stopping the timer
    timer.stop();
    float elapsedTime = timer.elapsedms();
    printf("warmup <<<%4d, %4d>>> elapsed %f ms \n", grid.x, block.x, elapsedTime);

    timer.start();
    testInnerArray<<<grid, block>>>(d_A, d_C, nElem);
    CHECK(cudaGetLastError());
    CHECK(cudaDeviceSynchronize());
    timer.stop();
    elapsedTime = timer.elapsedms();
    printf("testInnerArray <<<%4d, %4d>>> elapsed %f ms \n", grid.x, block.x, elapsedTime);

    // copy back and validate
    CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));
    checkInnerArray(hostRef, gpuRef, nElem);

    CHECK(cudaFree(d_A));
    CHECK(cudaFree(d_C));
    free(h_A);
    free(hostRef);
    free(gpuRef);
    CHECK(cudaDeviceReset());
    return 0;
}
NCU查看内存加载效率:
smsp__sass_average_data_bytes_per_sector_mem_global_op_ld.pct [%] 100
优化设备内存带宽利用率有两个目标:
·对齐及合并内存访问,以减少带宽的浪费
·足够的并发内存操作,以隐藏内存延迟