cuda编程实例-CSDN博客

本文链接：https://blog.csdn.net/learnererer/article/details/125255941

矢量求和

#include "error.cuh"
#include <stdio.h>
#include <cuda_runtime.h>

#define N   100

// Element-wise vector add: c[i] = a[i] + b[i] for every i in [0, N).
//
// Launch layout: any 1D grid of 1D blocks. Each thread starts at its global
// index and advances by the total number of launched threads (grid-stride
// loop), so a launch of N single-thread blocks handles exactly one element
// per block, and launches with multi-thread blocks are covered as well.
// Preconditions: a, b and c point to device-accessible arrays of at least
// N ints. No shared memory is used.
__global__ void add( int *a, int *b, int *c ) {
    // Total number of threads in the grid; the step between two elements
    // handled by the same thread.
    int stride = blockDim.x * gridDim.x;

    // The loop bound doubles as the out-of-range guard for surplus threads.
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += stride)
        c[i] = a[i] + b[i];
}

// Host driver for the fixed-size vector add: fills two N-element vectors on
// the CPU, adds them on the GPU and prints one "a + b = c" line per element.
// Every CUDA runtime call is wrapped in CHECK (provided by error.cuh, which
// is not shown in this file).
int main( void ) {
    int a[N], b[N], c[N];
    int *dev_a, *dev_b, *dev_c;

    // allocate the memory on the GPU
    CHECK( cudaMalloc( (void**)&dev_a, N * sizeof(int) ) );
    CHECK( cudaMalloc( (void**)&dev_b, N * sizeof(int) ) );
    CHECK( cudaMalloc( (void**)&dev_c, N * sizeof(int) ) );

    // fill the arrays 'a' and 'b' on the CPU
    for (int i=0; i<N; i++) {
        a[i] = -i;
        b[i] = i * i;
    }

    // copy the arrays 'a' and 'b' to the GPU
    CHECK( cudaMemcpy( dev_a, a, N * sizeof(int),
                              cudaMemcpyHostToDevice ) );
    CHECK( cudaMemcpy( dev_b, b, N * sizeof(int),
                              cudaMemcpyHostToDevice ) );

    // launch N blocks of one thread each: one block per vector element
    add<<<N,1>>>( dev_a, dev_b, dev_c );
    // a kernel launch returns no status of its own, so query the runtime
    // for launch/configuration errors explicitly
    CHECK( cudaGetLastError() );

    // copy the array 'c' back from the GPU to the CPU
    CHECK( cudaMemcpy( c, dev_c, N * sizeof(int),
                              cudaMemcpyDeviceToHost ) );

    // display the results
    for (int i=0; i<N; i++) {
        printf( "%d + %d = %d\n", a[i], b[i], c[i] );
    }

    // free the memory allocated on the GPU
    CHECK( cudaFree( dev_a ) );
    CHECK( cudaFree( dev_b ) );
    CHECK( cudaFree( dev_c ) );

    return 0;
}

julia集

在这里插入图片描述

在这里插入代码片

任意长度矢量求和

在这里插入图片描述

#include "error.cuh"
#include <stdio.h>
#include <cuda_runtime.h>
#define N   (33 * 1024)

// Element-wise vector add for vectors that may be longer than the number of
// launched threads: c[i] = a[i] + b[i] for every i in [0, N).
//
// Launch layout: 1D grid of 1D blocks, any size. With gridDim.x = 128 and
// blockDim.x = 128 there are 16384 threads for 33792 elements, so each
// thread processes several elements spaced one full grid apart.
// Preconditions: a, b and c point to device-accessible arrays of at least
// N ints. No shared memory is used.
__global__ void add( int *a, int *b, int *c ) {
    // Total number of threads in the grid = distance between consecutive
    // elements owned by the same thread.
    int stride = blockDim.x * gridDim.x;

    // Grid-stride loop: start at this thread's global index and hop by the
    // grid size until the end of the data. Every index in [0, N) is visited
    // by exactly one thread, and each thread only touches its own indices
    // instead of scanning the whole range.
    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < N; i += stride) {
        c[i] = a[i] + b[i];
    }
}
// Host driver for the arbitrary-length vector add: allocates host and device
// buffers, fills the inputs, runs the kernel, verifies c[i] == a[i] + b[i]
// on the CPU and prints the verdict.
// CUDA runtime calls are wrapped in CHECK (provided by error.cuh, which is
// not shown in this file).
int main( void ) {
    // host pointers and device pointers
    int *a, *b, *c;
    int *dev_a, *dev_b, *dev_c;

    // allocate the host buffers
    a = (int*)malloc( N * sizeof(int) );
    b = (int*)malloc( N * sizeof(int) );
    c = (int*)malloc( N * sizeof(int) );
    if (a == NULL || b == NULL || c == NULL) {
        fprintf( stderr, "Host memory allocation failed\n" );
        // free(NULL) is a no-op, so releasing all three is safe
        free( a );
        free( b );
        free( c );
        return 1;
    }

    // allocate the device buffers
    CHECK( cudaMalloc( (void**)&dev_a, N * sizeof(int) ) );
    CHECK( cudaMalloc( (void**)&dev_b, N * sizeof(int) ) );
    CHECK( cudaMalloc( (void**)&dev_c, N * sizeof(int) ) );

    // fill 'a' and 'b' on the CPU
    for (int i=0; i<N; i++) {
        a[i] = i;
        b[i] = 2 * i;
    }

    // copy 'a' and 'b' from the host to the device
    CHECK( cudaMemcpy( dev_a, a, N * sizeof(int),
                              cudaMemcpyHostToDevice ) );
    CHECK( cudaMemcpy( dev_b, b, N * sizeof(int),
                              cudaMemcpyHostToDevice ) );

    // launch the kernel on a 1D grid of 128 blocks with 128 threads each
    add<<<128,128>>>( dev_a, dev_b, dev_c );
    // a kernel launch returns no status of its own, so query the runtime
    // for launch/configuration errors explicitly
    CHECK( cudaGetLastError() );

    // copy the result from the device back to the host
    CHECK( cudaMemcpy( c, dev_c, N * sizeof(int),
                              cudaMemcpyDeviceToHost ) );

    // verify the result against a CPU computation
    bool success = true;
    for (int i=0; i<N; i++) {
        if ((a[i] + b[i]) != c[i]) {
            success = false;
            break;  // one mismatch already decides the verdict
        }
    }
    if (success)
        printf( "We did it!\n" );
    else
        printf("Not Pass!Error!!!\n");

    // release the device buffers
    CHECK( cudaFree( dev_a ) );
    CHECK( cudaFree( dev_b ) );
    CHECK( cudaFree( dev_c ) );

    // release the host buffers
    free( a );
    free( b );
    free( c );

    return 0;
}

矩阵处理

在这里插入图片描述

#define N 1000
#define BLOCK_SIZE 32

// N x N matrices in managed (unified) memory, so the same symbols can be
// accessed from host code and from kernels. Each one is defined exactly
// once; a second definition of the same array is a redefinition error.
__managed__ int input_Matrix[N][N];  // input data, filled by the host
__managed__ int output_GPU[N][N];    // result produced by the GPU kernel
__managed__ int output_CPU[N][N];    // reference result produced on the CPU
// Per-element matrix transform on the GPU, one thread per element:
//   - column and row both even -> output = input squared
//   - otherwise                -> output = input - 1
//
// Launch layout: 2D grid of 2D blocks; x selects the column (fastest-varying
// index), y selects the row. Threads that fall outside the N x N matrix
// return without touching memory. No shared memory is used.
__global__ void kernel(int input_M[N][N], int output_M[N][N])
{
    const int col = threadIdx.x + blockDim.x * blockIdx.x;
    const int row = threadIdx.y + blockDim.y * blockIdx.y;

    // Surplus threads at the right/bottom edge of the grid have nothing to do.
    if (col >= N || row >= N)
        return;

    const int value = input_M[row][col];
    const bool bothEven = (col % 2 == 0) && (row % 2 == 0);

    output_M[row][col] = bothEven ? value * value : value - 1;
}
// CPU reference for the matrix transform: for every element of the N x N
// input, store the square when both its row and column index are even,
// and the value minus one otherwise.
void cpu_kernel(int intput_M[N][N], int output_CPU[N][N])
{
    for (int row = 0; row != N; ++row) {
        // Row parity is constant across the inner loop.
        const bool rowEven = (row % 2 == 0);

        for (int col = 0; col != N; ++col) {
            const int value = intput_M[row][col];

            if (rowEven && col % 2 == 0)
                output_CPU[row][col] = value * value;
            else
                output_CPU[row][col] = value - 1;
        }
    }
}

// Host driver for the matrix example: fills input_Matrix with pseudo-random
// values, runs the transform once on the CPU and once on the GPU (each run
// bracketed by a pair of CUDA events for timing), then compares the two
// result matrices element by element and prints the verdict.
// CUDA runtime calls are wrapped in CHECK (provided by error.cuh, which is
// not shown in this file).
int main(int argc, char const *argv[])
{
    // Fill the input with values in [0, 3000].
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {
            input_Matrix[i][j] = rand() % 3001;
        }
    }

    // ---- CPU run ----
    cudaEvent_t start_cpu, stop_cpu;
    // Each event is created exactly once; creating the same handle twice
    // would overwrite it and leak the first event.
    CHECK(cudaEventCreate(&start_cpu));
    CHECK(cudaEventCreate(&stop_cpu));
    CHECK(cudaEventRecord(start_cpu));
    printf("\n***********CPU RUN**************\n");
    cpu_kernel(input_Matrix, output_CPU);
    CHECK(cudaEventRecord(stop_cpu));
    CHECK(cudaEventSynchronize(stop_cpu));
    float elapsed_time_cpu;
    CHECK(cudaEventElapsedTime(&elapsed_time_cpu, start_cpu, stop_cpu));
    printf("Time_CPU = %g ms.\n", elapsed_time_cpu);
    CHECK(cudaEventDestroy(start_cpu));
    CHECK(cudaEventDestroy(stop_cpu));

    // ---- GPU run ----
    cudaEvent_t start, stop_gpu;
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&stop_gpu));
    CHECK(cudaEventRecord(start));

    // Ceiling division: enough BLOCK_SIZE-wide blocks to cover N elements
    // in each dimension even when N is not a multiple of BLOCK_SIZE.
    unsigned int grid_rows = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
    unsigned int grid_cols = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;

    // Grid size: gridDim * blockDim is a multiple of BLOCK_SIZE and is at
    // least the data size in both x and y.
    dim3 dimGrid(grid_cols, grid_rows);
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    printf("\n***********GPU RUN**************\n");
    kernel<<<dimGrid, dimBlock>>>(input_Matrix, output_GPU);
    // A kernel launch returns no status of its own, so query the runtime
    // for launch/configuration errors explicitly.
    CHECK(cudaGetLastError());

    // Wait for the kernel so execution errors surface here and the host can
    // safely read the managed output afterwards.
    CHECK(cudaDeviceSynchronize());
    CHECK(cudaEventRecord(stop_gpu));
    CHECK(cudaEventSynchronize(stop_gpu));

    float elapsed_time_gpu;
    CHECK(cudaEventElapsedTime(&elapsed_time_gpu, start, stop_gpu));
    printf("Time_GPU = %g ms.\n", elapsed_time_gpu);
    CHECK(cudaEventDestroy(start));
    CHECK(cudaEventDestroy(stop_gpu));

    // ---- Verification ----
    printf("\n***********Check result**************\n");
    int ok = 1;
    for (int i = 0; i < N; ++i)
    {
        for (int j = 0; j < N; ++j)
        {
            // Both matrices hold ints, so an exact comparison is the right
            // test; no floating-point tolerance is involved.
            if (output_GPU[i][j] != output_CPU[i][j])
            {
                ok = 0;
            }
        }
    }

    if (ok)
    {
        printf("Pass!!!\n");
    }
    else
    {
        printf("Error!!!\n");
    }

    // The matrices are file-scope __managed__ arrays, so there is no
    // explicit allocation to release here.
    return 0;
}