结构体数组（SoA）与数组结构体（AoS）

最新推荐文章于 2025-03-31 20:33:50 发布

rtoax

最新推荐文章于 2025-03-31 20:33:50 发布

阅读量8.8k

点赞数 1

分类专栏：【语言与编译】【高性能计算】

【语言与编译】同时被 2 个专栏收录

456 篇文章

订阅专栏

【高性能计算】

48 篇文章

订阅专栏

本文对比了结构体数组（SoA）与数组结构体（AoS）两种数据布局方式，并通过CUDA编程实践展示了它们在并行计算中的性能差异。文中详细介绍了两种布局在CPU与GPU上的实现方式，以及如何利用CUDA进行有效的内存访问。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

1.结构体数组（SoA）

/*
 * SoA 结构体数组定义
 */
struct InnerArray
{
    float x[LEN];
    float y[LEN];
};
/*
 * CPU -> SoA 结构体数组的CPU计算形式
 */
void testInnerArrayHost(InnerArray *A, InnerArray *C, const int n)
{
    for (int idx = 0; idx < n; idx++)
    {
        C->x[idx] = A->x[idx] + 10.f;
        C->y[idx] = A->y[idx] + 20.f;
    }

    return;
}
/*
 * GPU -> SoA 结构体数组的CUDA计算模式
 */
__global__ void testInnerArrayDevice(InnerArray *data, InnerArray * result,
                               const int n)
{
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < n)
    {
        float tmpx = data->x[i];
        float tmpy = data->y[i];

        tmpx += 10.f;
        tmpy += 20.f;
        result->x[i] = tmpx;
        result->y[i] = tmpy;
    }
}

2.数组结构体（AoS）

/*
 * AoS
 */
struct innerStruct
{
    float x;
    float y;
};
/*
 * CPU -> AoS
 */
void testInnerStructHost(innerStruct *A, innerStruct *C, const int n)
{
    for (int idx = 0; idx < n; idx++)
    {
        C[idx].x = A[idx].x + 10.f;
        C[idx].y = A[idx].y + 20.f;
    }

    return;
}
/*
 * GPU -> AoS
 */
__global__ void testInnerStructDevice(innerStruct *data, innerStruct * result,
                                const int n)
{
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < n)
    {
        innerStruct tmp = data[i];
        tmp.x += 10.f;
        tmp.y += 20.f;
        result[i] = tmp;
    }
}

3.结构体数组（SoA）与数组结构体（AoS）二者的区别

许多并行编程范式，尤其是SIMD（单指令多数据）型范式，更倾向于使用SoA。在CUDA C编程中也普遍倾向于SoA，一维数据元素是为全局内存的有效合并访问而预先准备好的，而相同内存操作引用的同字段元素在存储时时彼此相邻的。

4.给出源代码示例，（《CUDA C编程中文翻译版本》，如有侵权，联系删除）源码网址：点击打开链接

SoA

#include "../common/common.h"
#include <cuda_runtime.h>
#include <stdio.h>

/*
 * A simple example of using a structore of arrays to store data on the device.
 * This example is used to study the impact on performance of data layout on the
 * GPU.
 *
 * SoA: contiguous reads for x and y
 */

#define LEN 1<<22

struct InnerArray
{
    float x[LEN];
    float y[LEN];
};

// functions for inner array outer struct
void initialInnerArray(InnerArray *ip,  int size)
{
    for (int i = 0; i < size; i++)
    {
        ip->x[i] = (float)( rand() & 0xFF ) / 100.0f;
        ip->y[i] = (float)( rand() & 0xFF ) / 100.0f;
    }

    return;
}

void testInnerArrayHost(InnerArray *A, InnerArray *C, const int n)
{
    for (int idx = 0; idx < n; idx++)
    {
        C->x[idx] = A->x[idx] + 10.f;
        C->y[idx] = A->y[idx] + 20.f;
    }

    return;
}


void printfHostResult(InnerArray *C, const int n)
{
    for (int idx = 0; idx < n; idx++)
    {
        printf("printout idx %d:  x %f y %f\n", idx, C->x[idx], C->y[idx]);
    }

    return;
}

void checkInnerArray(InnerArray *hostRef, InnerArray *gpuRef, const int N)
{
    double epsilon = 1.0E-8;
    bool match = 1;

    for (int i = 0; i < N; i++)
    {
        if (abs(hostRef->x[i] - gpuRef->x[i]) > epsilon)
        {
            match = 0;
            printf("different on x %dth element: host %f gpu %f\n", i,
                   hostRef->x[i], gpuRef->x[i]);
            break;
        }

        if (abs(hostRef->y[i] - gpuRef->y[i]) > epsilon)
        {
            match = 0;
            printf("different on y %dth element: host %f gpu %f\n", i,
                   hostRef->y[i], gpuRef->y[i]);
            break;
        }
    }

    if (!match)  printf("Arrays do not match.\n\n");
}

__global__ void testInnerArray(InnerArray *data, InnerArray * result,
                               const int n)
{
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < n)
    {
        float tmpx = data->x[i];
        float tmpy = data->y[i];

        tmpx += 10.f;
        tmpy += 20.f;
        result->x[i] = tmpx;
        result->y[i] = tmpy;
    }
}

__global__ void warmup2(InnerArray *data, InnerArray * result, const int n)
{
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < n)
    {
        float tmpx = data->x[i];
        float tmpy = data->y[i];
        tmpx += 10.f;
        tmpy += 20.f;
        result->x[i] = tmpx;
        result->y[i] = tmpy;
    }
}

// test for array of struct
int main(int argc, char **argv)
{
    // set up device
    int dev = 0;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("%s test struct of array at ", argv[0]);
    printf("device %d: %s \n", dev, deviceProp.name);
    CHECK(cudaSetDevice(dev));

    // allocate host memory
    int nElem = LEN;
    size_t nBytes = sizeof(InnerArray);
    InnerArray     *h_A = (InnerArray *)malloc(nBytes);
    InnerArray *hostRef = (InnerArray *)malloc(nBytes);
    InnerArray *gpuRef  = (InnerArray *)malloc(nBytes);

    // initialize host array
    initialInnerArray(h_A, nElem);
    testInnerArrayHost(h_A, hostRef, nElem);

    // allocate device memory
    InnerArray *d_A, *d_C;
    CHECK(cudaMalloc((InnerArray**)&d_A, nBytes));
    CHECK(cudaMalloc((InnerArray**)&d_C, nBytes));

    // copy data from host to device
    CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));

    // set up offset for summary
    int blocksize = 128;

    if (argc > 1) blocksize = atoi(argv[1]);

    // execution configuration
    dim3 block (blocksize, 1);
    dim3 grid  ((nElem + block.x - 1) / block.x, 1);

    // kernel 1:
    double iStart = seconds();
    warmup2<<<grid, block>>>(d_A, d_C, nElem);
    CHECK(cudaDeviceSynchronize());
    double iElaps = seconds() - iStart;
    printf("warmup2      <<< %3d, %3d >>> elapsed %f sec\n", grid.x, block.x,
           iElaps);
    CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));
    checkInnerArray(hostRef, gpuRef, nElem);
    CHECK(cudaGetLastError());

    iStart = seconds();
    testInnerArray<<<grid, block>>>(d_A, d_C, nElem);
    CHECK(cudaDeviceSynchronize());
    iElaps = seconds() - iStart;
    printf("innerarray   <<< %3d, %3d >>> elapsed %f sec\n", grid.x, block.x,
           iElaps);
    CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));
    checkInnerArray(hostRef, gpuRef, nElem);
    CHECK(cudaGetLastError());

    CHECK(cudaFree(d_A));
    CHECK(cudaFree(d_C));
    free(h_A);
    free(hostRef);
    free(gpuRef);

    // reset device
    CHECK(cudaDeviceReset());
    return EXIT_SUCCESS;
}

AoS

#include "../common/common.h"
#include <cuda_runtime.h>
#include <stdio.h>

/*
 * A simple example of using an array of structures to store data on the device.
 * This example is used to study the impact on performance of data layout on the
 * GPU.
 *
 * AoS: one contiguous 64-bit read to get x and y (up to 300 cycles)
 */

#define LEN 1<<22

struct innerStruct
{
    float x;
    float y;
};

struct innerArray
{
    float x[LEN];
    float y[LEN];
};

void initialInnerStruct(innerStruct *ip,  int size)
{
    for (int i = 0; i < size; i++)
    {
        ip[i].x = (float)(rand() & 0xFF) / 100.0f;
        ip[i].y = (float)(rand() & 0xFF) / 100.0f;
    }

    return;
}

void testInnerStructHost(innerStruct *A, innerStruct *C, const int n)
{
    for (int idx = 0; idx < n; idx++)
    {
        C[idx].x = A[idx].x + 10.f;
        C[idx].y = A[idx].y + 20.f;
    }

    return;
}

void checkInnerStruct(innerStruct *hostRef, innerStruct *gpuRef, const int N)
{
    double epsilon = 1.0E-8;
    bool match = 1;

    for (int i = 0; i < N; i++)
    {
        if (abs(hostRef[i].x - gpuRef[i].x) > epsilon)
        {
            match = 0;
            printf("different on %dth element: host %f gpu %f\n", i,
                    hostRef[i].x, gpuRef[i].x);
            break;
        }

        if (abs(hostRef[i].y - gpuRef[i].y) > epsilon)
        {
            match = 0;
            printf("different on %dth element: host %f gpu %f\n", i,
                    hostRef[i].y, gpuRef[i].y);
            break;
        }
    }

    if (!match)  printf("Arrays do not match.\n\n");
}

__global__ void testInnerStruct(innerStruct *data, innerStruct * result,
                                const int n)
{
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < n)
    {
        innerStruct tmp = data[i];
        tmp.x += 10.f;
        tmp.y += 20.f;
        result[i] = tmp;
    }
}

__global__ void warmup(innerStruct *data, innerStruct * result, const int n)
{
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < n)
    {
        innerStruct tmp = data[i];
        tmp.x += 10.f;
        tmp.y += 20.f;
        result[i] = tmp;
    }
}

int main(int argc, char **argv)
{
    // set up device
    int dev = 0;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("%s test struct of array at ", argv[0]);
    printf("device %d: %s \n", dev, deviceProp.name);
    CHECK(cudaSetDevice(dev));

    // allocate host memory
    int nElem = LEN;
    size_t nBytes = nElem * sizeof(innerStruct);
    innerStruct     *h_A = (innerStruct *)malloc(nBytes);
    innerStruct *hostRef = (innerStruct *)malloc(nBytes);
    innerStruct *gpuRef  = (innerStruct *)malloc(nBytes);

    // initialize host array
    initialInnerStruct(h_A, nElem);
    testInnerStructHost(h_A, hostRef, nElem);

    // allocate device memory
    innerStruct *d_A, *d_C;
    CHECK(cudaMalloc((innerStruct**)&d_A, nBytes));
    CHECK(cudaMalloc((innerStruct**)&d_C, nBytes));

    // copy data from host to device
    CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));

    // set up offset for summaryAU: It is blocksize not offset. Thanks.CZ
    int blocksize = 128;

    if (argc > 1) blocksize = atoi(argv[1]);

    // execution configuration
    dim3 block (blocksize, 1);
    dim3 grid  ((nElem + block.x - 1) / block.x, 1);

    // kernel 1: warmup
    double iStart = seconds();
    warmup<<<grid, block>>>(d_A, d_C, nElem);
    CHECK(cudaDeviceSynchronize());
    double iElaps = seconds() - iStart;
    printf("warmup      <<< %3d, %3d >>> elapsed %f sec\n", grid.x, block.x,
           iElaps);
    CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));
    checkInnerStruct(hostRef, gpuRef, nElem);
    CHECK(cudaGetLastError());

    // kernel 2: testInnerStruct
    iStart = seconds();
    testInnerStruct<<<grid, block>>>(d_A, d_C, nElem);
    CHECK(cudaDeviceSynchronize());
    iElaps = seconds() - iStart;
    printf("innerstruct <<< %3d, %3d >>> elapsed %f sec\n", grid.x, block.x,
           iElaps);
    CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));
    checkInnerStruct(hostRef, gpuRef, nElem);
    CHECK(cudaGetLastError());

    // free memories both host and device
    CHECK(cudaFree(d_A));
    CHECK(cudaFree(d_C));
    free(h_A);
    free(hostRef);
    free(gpuRef);

    // reset device
    CHECK(cudaDeviceReset());
    return EXIT_SUCCESS;
}