【cuda学习日记】4.3 结构体数组与数组结构体
4.3 数组结构体(AoS)和结构体数组(SoA)
AoS方法进行存储
struct innerStruct{
float x;
float y;
};
struct innerStruct myAOS[N];
SoA方法来存储数据
struct innerArray{
float x[N];
float y[N];
};
struct innerArray moa;
如图说明了AoS和SoA方法的内存布局,用AoS模式在GPU上存储示例数据并执行一个只有x字段的应用程序,将导致50%的带宽损失
4.3.1 简单示例AoS
#include <cuda_runtime.h>
#include <stdio.h>
#include "../common/common.h"
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <windows.h>
// Number of elements (2^20). Parenthesized so the macro expands safely
// inside larger expressions (e.g. LEN * 2, which with the bare form would
// parse as 1 << (20 * 2) and overflow int).
#define LEN (1 << 20)
// Array-of-Structures (AoS) element: x and y are interleaved in memory,
// so a warp reading a single field touches twice the bytes it needs
// (the 50% load efficiency measured below).
struct innerStruct{
float x;
float y;
};
// Structure-of-Arrays (SoA) layout: all x values contiguous, then all y.
// Unused in this AoS demo; declared here for comparison with the SoA version.
struct innerArray{
float x[LEN];
float y[LEN];
};
// One thread per element: load the whole AoS struct, offset both fields,
// store it back. Expects a 1D grid of 1D blocks covering at least n threads.
// The whole-struct load is intentional — it is the access pattern this
// example profiles.
__global__ void testInnerStruct(innerStruct *data, innerStruct *result, const int n) {
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) return;          // guard the grid tail
    innerStruct elem = data[idx];  // interleaved x/y load
    elem.x += 10.f;
    elem.y += 20.f;
    result[idx] = elem;
}
// Same body as testInnerStruct; launched once before the timed run so
// one-time startup costs do not skew the measurement.
__global__ void warmup(innerStruct *data, innerStruct *result, const int n) {
    unsigned int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid < n) {
        innerStruct v = data[gid];
        v.x += 10.f;
        v.y += 20.f;
        result[gid] = v;
    }
}
// CPU reference: apply the same +10/+20 field offsets the kernel applies,
// so the device output can be validated element-by-element.
void testInnerStructHost(innerStruct *data, innerStruct *result, const int n) {
    for (int idx = 0; idx < n; ++idx) {
        result[idx].x = data[idx].x + 10.f;
        result[idx].y = data[idx].y + 20.f;
    }
}
// Fill both fields of each element with small pseudo-random values
// in [0, 2.55]. rand() is called for x first, then y, per element.
void initialInnerStruct(innerStruct *ip, int size)
{
    for (int i = 0; i < size; ++i) {
        float rx = (float)(rand() & 0xFF) / 100.0f;
        float ry = (float)(rand() & 0xFF) / 100.0f;
        ip[i].x = rx;
        ip[i].y = ry;
    }
}
// Compare host and device results field-by-field; report the first mismatch.
// Bug fix: the original used abs(), which with only <stdlib.h> in scope is
// the *integer* overload — fractional differences truncated to 0 and real
// mismatches were silently missed. Compute the magnitude in double instead
// (no extra include required).
void checkInnerStruct(innerStruct *hostRef, innerStruct *gpuRef, const int N)
{
    double epsilon = 1.0E-8;
    bool match = 1;
    for (int i = 0; i < N; i++)
    {
        double dx = (double)hostRef[i].x - (double)gpuRef[i].x;
        if (dx < 0.0) dx = -dx;
        if (dx > epsilon)
        {
            match = 0;
            printf("different on %dth element: host %f gpu %f\n", i,
                   hostRef[i].x, gpuRef[i].x);
            break;
        }
        double dy = (double)hostRef[i].y - (double)gpuRef[i].y;
        if (dy < 0.0) dy = -dy;
        if (dy > epsilon)
        {
            match = 0;
            printf("different on %dth element: host %f gpu %f\n", i,
                   hostRef[i].y, gpuRef[i].y);
            break;
        }
    }
    if (!match) printf("Arrays do not match.\n\n");
}
// Entry point for the AoS demo: allocate interleaved-struct buffers, run
// warmup + timed kernel, validate against the CPU reference.
// argv[1] (optional) overrides the block size, default 128.
// Fix: every CUDA API call is now wrapped in CHECK (consistent with the
// existing cudaGetDeviceProperties check), and cudaGetLastError() is read
// after each launch so bad launch configurations are not silent.
int main(int argc, char **argv) {
    // select device and report it
    int dev = 0;
    CHECK(cudaSetDevice(dev));
    cudaDeviceProp deviceprop;
    CHECK(cudaGetDeviceProperties(&deviceprop, dev));
    printf("device %d: %s \n", dev, deviceprop.name);

    // host buffers: input, CPU reference, GPU result
    int nElem = LEN;
    size_t nBytes = nElem * sizeof(innerStruct);
    innerStruct *h_A = (innerStruct *)malloc(nBytes);
    innerStruct *hostRef = (innerStruct *)malloc(nBytes);
    innerStruct *gpuRef = (innerStruct *)malloc(nBytes);
    initialInnerStruct(h_A, nElem);
    testInnerStructHost(h_A, hostRef, nElem);

    // device buffers
    innerStruct *d_A, *d_C;
    CHECK(cudaMalloc((innerStruct **)&d_A, nBytes));
    CHECK(cudaMalloc((innerStruct **)&d_C, nBytes));
    CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));

    // 1D launch configuration, block size overridable from the command line
    int blocksize = 128;
    if (argc > 1) blocksize = atoi(argv[1]);
    dim3 block(blocksize, 1);
    dim3 grid((nElem + block.x - 1) / block.x, 1);

    Timer timer;
    timer.start();
    warmup<<<grid, block>>>(d_A, d_C, nElem);
    CHECK(cudaGetLastError());       // catch bad launch configuration
    CHECK(cudaDeviceSynchronize());  // surface async kernel errors before stopping the timer
    timer.stop();
    float elapsedTime = timer.elapsedms();
    printf("warmup <<<%4d, %4d>>> elapsed %f ms \n", grid.x, block.x, elapsedTime);

    timer.start();
    testInnerStruct<<<grid, block>>>(d_A, d_C, nElem);
    CHECK(cudaGetLastError());
    CHECK(cudaDeviceSynchronize());
    timer.stop();
    elapsedTime = timer.elapsedms();
    printf("testInnerStruct <<<%4d, %4d>>> elapsed %f ms \n", grid.x, block.x, elapsedTime);

    // copy back and validate
    CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));
    checkInnerStruct(hostRef, gpuRef, nElem);

    CHECK(cudaFree(d_A));
    CHECK(cudaFree(d_C));
    free(h_A);
    free(hostRef);
    free(gpuRef);
    CHECK(cudaDeviceReset());
    return 0;
}
用NCU查看加载内存效率,只有50%:
smsp__sass_average_data_bytes_per_sector_mem_global_op_ld.pct [%] 50
执行时间:
testInnerStruct <<<8192, 128>>> elapsed 0.036864 ms
4.3.2 简单示例SoA
#include <cuda_runtime.h>
#include <stdio.h>
#include "../common/common.h"
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <windows.h>
// Number of elements (2^20). Parenthesized so the macro expands safely
// inside larger expressions (the bare 1 << 20 would bind shift last,
// e.g. LEN * 2 would become 1 << 40).
#define LEN (1 << 20)
// AoS element with interleaved x/y fields.
// Unused in this SoA demo; declared for comparison with the AoS version.
struct innerStruct{
float x;
float y;
};
// Structure-of-Arrays (SoA): all x values contiguous, then all y, so
// consecutive threads reading x[i] produce fully coalesced transactions
// (the 100% load efficiency measured below).
struct innerArray{
float x[LEN];
float y[LEN];
};
// One thread per index: SoA access — x[i] and y[i] are read from separate
// contiguous arrays, so each field load by a warp coalesces fully.
// Expects a 1D grid of 1D blocks covering at least n threads.
__global__ void testInnerArray(innerArray *data, innerArray *result, const int n) {
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) return;  // grid-tail guard
    float vx = data->x[idx] + 10.0f;
    float vy = data->y[idx] + 20.0f;
    result->x[idx] = vx;
    result->y[idx] = vy;
}
// Same body as testInnerArray; launched once beforehand so the timed run
// is not skewed by one-time startup cost.
__global__ void warmup(innerArray *data, innerArray *result, const int n) {
    unsigned int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid < n) {
        float px = data->x[gid];
        float py = data->y[gid];
        result->x[gid] = px + 10.0f;
        result->y[gid] = py + 20.0f;
    }
}
// Host-side helpers for the SoA (inner-array) layout.
// Fill both arrays with small pseudo-random values in [0, 2.55];
// rand() is called for x then y at each index, matching the AoS initializer.
void initialInnerArray(innerArray *ip, int size)
{
    int i = 0;
    while (i < size) {
        ip->x[i] = (float)(rand() & 0xFF) / 100.0f;
        ip->y[i] = (float)(rand() & 0xFF) / 100.0f;
        ++i;
    }
}
// CPU reference for the SoA kernel: the same +10/+20 offsets, applied
// one array at a time.
void testInnerArrayHost(innerArray *A, innerArray *C, const int n)
{
    for (int k = 0; k < n; ++k)
        C->x[k] = A->x[k] + 10.f;
    for (int k = 0; k < n; ++k)
        C->y[k] = A->y[k] + 20.f;
}
// Compare host and device SoA results; report the first mismatch.
// Bug fix: the original used abs(), which with only <stdlib.h> in scope is
// the *integer* overload — fractional differences truncated to 0 and real
// mismatches were silently missed. Compute the magnitude in double instead
// (no extra include required).
void checkInnerArray(innerArray *hostRef, innerArray *gpuRef, const int N)
{
    double epsilon = 1.0E-8;
    bool match = 1;
    for (int i = 0; i < N; i++)
    {
        double dx = (double)hostRef->x[i] - (double)gpuRef->x[i];
        if (dx < 0.0) dx = -dx;
        if (dx > epsilon)
        {
            match = 0;
            printf("different on x %dth element: host %f gpu %f\n", i,
                   hostRef->x[i], gpuRef->x[i]);
            break;
        }
        double dy = (double)hostRef->y[i] - (double)gpuRef->y[i];
        if (dy < 0.0) dy = -dy;
        if (dy > epsilon)
        {
            match = 0;
            printf("different on y %dth element: host %f gpu %f\n", i,
                   hostRef->y[i], gpuRef->y[i]);
            break;
        }
    }
    if (!match) printf("Arrays do not match.\n\n");
}
// Entry point for the SoA demo: allocate one innerArray per side (note
// nBytes = sizeof(innerArray) — the struct already holds both LEN-sized
// arrays), run warmup + timed kernel, validate against the CPU reference.
// argv[1] (optional) overrides the block size, default 128.
// Fix: every CUDA API call is now wrapped in CHECK (consistent with the
// existing cudaGetDeviceProperties check), and cudaGetLastError() is read
// after each launch so bad launch configurations are not silent.
int main(int argc, char **argv) {
    // select device and report it
    int dev = 0;
    CHECK(cudaSetDevice(dev));
    cudaDeviceProp deviceprop;
    CHECK(cudaGetDeviceProperties(&deviceprop, dev));
    printf("device %d: %s \n", dev, deviceprop.name);

    // host buffers: input, CPU reference, GPU result
    int nElem = LEN;
    size_t nBytes = sizeof(innerArray);  // whole SoA struct (x[LEN] + y[LEN])
    innerArray *h_A = (innerArray *)malloc(nBytes);
    innerArray *hostRef = (innerArray *)malloc(nBytes);
    innerArray *gpuRef = (innerArray *)malloc(nBytes);
    initialInnerArray(h_A, nElem);
    testInnerArrayHost(h_A, hostRef, nElem);

    // device buffers
    innerArray *d_A, *d_C;
    CHECK(cudaMalloc((innerArray **)&d_A, nBytes));
    CHECK(cudaMalloc((innerArray **)&d_C, nBytes));
    CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));

    // 1D launch configuration, block size overridable from the command line
    int blocksize = 128;
    if (argc > 1) blocksize = atoi(argv[1]);
    dim3 block(blocksize, 1);
    dim3 grid((nElem + block.x - 1) / block.x, 1);

    Timer timer;
    timer.start();
    warmup<<<grid, block>>>(d_A, d_C, nElem);
    CHECK(cudaGetLastError());       // catch bad launch configuration
    CHECK(cudaDeviceSynchronize());  // surface async kernel errors before stopping the timer
    timer.stop();
    float elapsedTime = timer.elapsedms();
    printf("warmup <<<%4d, %4d>>> elapsed %f ms \n", grid.x, block.x, elapsedTime);

    timer.start();
    testInnerArray<<<grid, block>>>(d_A, d_C, nElem);
    CHECK(cudaGetLastError());
    CHECK(cudaDeviceSynchronize());
    timer.stop();
    elapsedTime = timer.elapsedms();
    printf("testInnerArray <<<%4d, %4d>>> elapsed %f ms \n", grid.x, block.x, elapsedTime);

    // copy back and validate
    CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));
    checkInnerArray(hostRef, gpuRef, nElem);

    CHECK(cudaFree(d_A));
    CHECK(cudaFree(d_C));
    free(h_A);
    free(hostRef);
    free(gpuRef);
    CHECK(cudaDeviceReset());
    return 0;
}
NCU查看内存加载效率:
smsp__sass_average_data_bytes_per_sector_mem_global_op_ld.pct [%] 100
优化设备内存带宽利用率有两个目标:
·对齐及合并内存访问,以减少带宽的浪费
·足够的并发内存操作,以隐藏内存延迟