CUDA 并行计算,课程作业

CUDA 并行计算,课程作业

https://wu-kan.cn/2019/05/28/%E5%B9%B6%E8%A1%8C%E4%B8%8E%E5%88%86%E5%B8%83%E5%BC%8F%E8%AE%A1%E7%AE%97-6/

CUDA练习两则

28 May 2019 11573字 39分 CC BY 4.0 (除特别声明或转载文章外) 如果这篇博客帮助到你,可以请我喝一杯咖啡~

CUDA-homework-1

Start from the provided skeleton code error-test.cu that provides some convenience macros for error checking. The macros are defined in the header file error_checks_1.h. Add the missing memory allocations and copies and the kernel launch and check that your code works.

Remember that you can use also cuda-memcheck! If you have time, you can also check what happens if you remove all error checks and do the same tests again.

What happens if you try to launch kernel with too large block size? When do you catch the error if you remove the cudaStreamSynchronize(NULL) call

block size 超过设备上限(每个 block 最多 1024 个线程)时,kernel 不会执行,启动会直接失败并返回 invalid configuration argument。在 easyHPC 上面交了(vector_add<<<1, 1025>>>(dC, dA, dB, N);)之后,出现了这样的报错

Error: vector_add kernel at 0_4323.cu(86): invalid configuration argument
yhrun: error: gn07: task 1: Exited with exit code 1

删去设备同步的调用之后,报错信息如下。

Error: vector_add kernel at 0_4323.cu(86): invalid configuration argument
yhrun: error: gn07: task 0: Exited with exit code 1

What happens if you try to dereference a pointer to device memory in host code

运行时在解引用的地方报 error 了。

Error: vector_add kernel at 0_4323.cu(85): invalid configuration argument
yhrun: error: gn07: task 0: Exited with exit code 1

What if you try to access host memory from the kernel

编译时报错。

0_4323.cu(48): error: identifier "hb" is undefined in device code

1 error detected in the compilation of "/tmp/tmpxft_0000440e_00000000-11_0_4323.cpp2.i".

slurmd[gn07]: execve(): 0_4323.cu.out: No such file or directory
yhrun: error: gn07: task 0: Exited with exit code 2
rm: cannot remove ‘0_4323.cu.out’: No such file or directory

提交代码

#include <stdio.h>
#include <math.h>
//#include "error_checks.h" // Macros CUDA_CHECK and CHECK_ERROR_MSG
// This header provides two helper macros for error checking
// See the exercise skeletons and answers for usage examples.

#ifndef COURSE_UTIL_H_
#define COURSE_UTIL_H_

#include <stdio.h>
#include <stdlib.h>

#define CUDA_CHECK(errarg) __checkErrorFunc(errarg, __FILE__, __LINE__)
#define CHECK_ERROR_MSG(errstr) __checkErrMsgFunc(errstr, __FILE__, __LINE__)

// Abort with file/line context when a CUDA runtime call returns an error.
// Used via the CUDA_CHECK macro, which supplies __FILE__/__LINE__.
// Fix: also print cudaGetErrorString(errarg) — the original message gave
// only the location, which makes failures hard to diagnose.
inline void __checkErrorFunc(cudaError_t errarg, const char *file,
                             const int line)
{
    if (errarg)
    {
        fprintf(stderr, "Error at %s(%i): %s\n", file, line,
                cudaGetErrorString(errarg));
        exit(EXIT_FAILURE);
    }
}

// Report the most recent CUDA runtime error (if any) together with a
// caller-supplied label and file/line, then abort. Used via the
// CHECK_ERROR_MSG macro, typically right after a kernel launch.
inline void __checkErrMsgFunc(const char *errstr, const char *file,
                              const int line)
{
    const cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess)
        return;
    fprintf(stderr, "Error: %s at %s(%i): %s\n",
            errstr, file, line, cudaGetErrorString(err));
    exit(EXIT_FAILURE);
}

#endif

// Element-wise vector addition: C[i] = A[i] + B[i] for i in [0, N).
// Expects a 1-D launch; threads past the end of the arrays do nothing.
__global__ void vector_add(double *C, const double *A, const double *B, int N)
{
    const int tid = threadIdx.x + blockIdx.x * blockDim.x;

    // Guard the grid tail: the grid rarely divides N evenly.
    if (tid >= N)
        return;

    C[tid] = A[tid] + B[tid];
}

// Driver: builds two small host vectors, adds them on the GPU, and prints
// the result. Fixes: lines that were bare prose without comment markers
// ("Add the kernel call here", "Copy back the results...") were compile
// errors; the unused `dim3 grid, threads` pair is removed; the launch now
// actually uses ThreadsInBlock with a ceil-div block count instead of a
// hard-coded <<<1, 32>>>.
int main(void)
{
    const int N = 20;               // number of vector elements
    const int ThreadsInBlock = 128; // threads per block
    double *dA, *dB, *dC;           // device buffers
    double hA[N], hB[N], hC[N];     // host buffers

    // Inputs: hA[i] = i, hB[i] = i^2, so hC[i] should be i + i^2.
    for (int i = 0; i < N; ++i)
    {
        hA[i] = (double)i;
        hB[i] = (double)i * i;
    }

    // Allocate device memory and copy the inputs host -> device.
    CUDA_CHECK(cudaMalloc((void **)&dA, sizeof(double) * N));
    CUDA_CHECK(cudaMalloc((void **)&dB, sizeof(double) * N));
    CUDA_CHECK(cudaMalloc((void **)&dC, sizeof(double) * N));
    CUDA_CHECK(cudaMemcpy(dA, hA, sizeof(double) * N, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(dB, hB, sizeof(double) * N, cudaMemcpyHostToDevice));

    // Launch enough blocks to cover all N elements (ceil division).
    const int blocks = (N + ThreadsInBlock - 1) / ThreadsInBlock;
    vector_add<<<blocks, ThreadsInBlock>>>(dC, dA, dB, N);

    // Here we add an explicit synchronization so that we catch errors
    // as early as possible. Don't do this in production code!
    cudaStreamSynchronize(NULL);
    CHECK_ERROR_MSG("vector_add kernel");

    // Copy back the results and free the device memory.
    CUDA_CHECK(cudaMemcpy(hC, dC, sizeof(double) * N, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaFree(dA));
    CUDA_CHECK(cudaFree(dB));
    CUDA_CHECK(cudaFree(dC));

    for (int i = 0; i < N; i++)
        printf("%5.1f\n", hC[i]);

    return 0;
}

CUDA-homework-2

In this exercise we will implement a Jacobi iteration which is a very simple finite-difference scheme. Familiarize yourself with the provided skeleton. Then implement following things:

Write the missing CUDA kernel sweepGPU that implements the same algorithm as the sweepCPU function. Check that the reported average difference is in the order of the numerical accuracy.

Experiment with different grid and block sizes and compare the execution times.

代码

//err_checker.h
// This header provides two helper macros for error checking
// See the exercise skeletons and answers for usage examples.

#ifndef COURSE_UTIL_H_
#define COURSE_UTIL_H_

#include <stdio.h>
#include <stdlib.h>

#define CUDA_CHECK(errarg) __checkErrorFunc(errarg, __FILE__, __LINE__)
#define CHECK_ERROR_MSG(errstr) __checkErrMsgFunc(errstr, __FILE__, __LINE__)

// Abort with file/line context when a CUDA runtime call returns an error.
// Used via the CUDA_CHECK macro, which supplies __FILE__/__LINE__.
// Fix: also print cudaGetErrorString(errarg) — the original message gave
// only the location, which makes failures hard to diagnose.
inline void __checkErrorFunc(cudaError_t errarg, const char *file,
                             const int line)
{
    if (errarg)
    {
        fprintf(stderr, "Error at %s(%i): %s\n", file, line,
                cudaGetErrorString(errarg));
        exit(EXIT_FAILURE);
    }
}

// Report the most recent CUDA runtime error (if any) together with a
// caller-supplied label and file/line, then abort. Used via the
// CHECK_ERROR_MSG macro, typically right after a kernel launch.
inline void __checkErrMsgFunc(const char *errstr, const char *file,
                              const int line)
{
    const cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess)
        return;
    fprintf(stderr, "Error: %s at %s(%i): %s\n",
            errstr, file, line, cudaGetErrorString(err));
    exit(EXIT_FAILURE);
}

#endif
//jacobi.h
#ifndef EX3_H_
#define EX3_H_

#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/transform_reduce.h>
#include <thrust/iterator/zip_iterator.h>

// Helper function prototypes
double compareArrays(const double *a, const double *b, int N);
double diffCPU(const double *a, const double *b, int N);
void sweepCPU(double *phi, const double *phiPrev,
              const double *source, double h2, int N);

/* -------------------------------------------------------------------------
   EXTRACURRICULAR ACTIVITIES

   This part provides the reduction operation (in this case summation of
   difference of two arrays) using thrust library. Thrust mimics the
   syntax and design of standard template library (STL) of C++. Thrust is
   also a part of CUDA 4 SDK.
   More information can be found from thrust home page
   http://code.google.com/p/thrust/
   ----------------------------------------------------------------------- */

// Thrust functor: squared difference of a tuple's components,
// i.e. (b - a)^2 for an input tuple (a, b).
template <typename T>
class square_diff_thr : public thrust::unary_function<thrust::tuple<T, T>, T>
{
public:
    __host__ __device__
        T
        operator()(const thrust::tuple<T, T> &x) const
    {
        const T d = thrust::get<1>(x) - thrust::get<0>(x);
        return d * d;
    }
};

// Thrust functor: plain square, x -> x * x.
template <typename T>
class square_thr : public thrust::unary_function<T, T>
{
public:
    __host__ __device__
        T
        operator()(const T &v) const
    {
        return v * v;
    }
};

// Relative L2 difference of two length-N*N device arrays, computed on the
// GPU with Thrust: sqrt( sum_i (B[i]-A[i])^2 / sum_i B[i]^2 ).
template <typename T>
T diffGPU(T *A_d, T *B_d, int N)
{
    typedef thrust::device_ptr<T> DevPtr;
    typedef thrust::tuple<DevPtr, DevPtr> PtrPair;
    typedef thrust::zip_iterator<PtrPair> PairIter;

    const int count = N * N;
    DevPtr a_begin(A_d);
    DevPtr b_begin(B_d);

    // Zip the two sequences so a single pass sees (A[i], B[i]) tuples.
    PairIter first =
        thrust::make_zip_iterator(thrust::make_tuple(a_begin, b_begin));
    PairIter last =
        thrust::make_zip_iterator(thrust::make_tuple(a_begin + count,
                                                     b_begin + count));

    const T diff2 = thrust::transform_reduce(first, last, square_diff_thr<T>(),
                                             static_cast<T>(0),
                                             thrust::plus<T>());
    const T norm2 = thrust::transform_reduce(b_begin, b_begin + count,
                                             square_thr<T>(),
                                             static_cast<T>(0),
                                             thrust::plus<T>());

    return sqrt(diff2 / norm2);
}

#endif // EX3_H_

//jacobi.cu
#include <time.h>
#include <stdio.h>
//#include "jacobi.h"
//#include "error_checks.h"

// Change this to 0 if CPU reference result is not needed
#define COMPUTE_CPU_REFERENCE 1
#define MAX_ITERATIONS 3000

// CPU kernel
// CPU reference Jacobi sweep: update every interior point of the N x N
// grid `phi` from its four neighbours in `phiPrev` (5-point stencil):
//   phi[i,j] = 0.25 * (left + right + below + above - h2 * source[i,j])
// Boundary points (i or j equal to 0 or N-1) are left untouched.
void sweepCPU(double *phi, const double *phiPrev, const double *source,
              double h2, int N)
{
    for (int j = 1; j < N - 1; j++)
    {
        for (int i = 1; i < N - 1; i++)
        {
            // Row-major layout: +/-1 moves in i, +/-N moves in j.
            const int c = i + j * N;
            phi[c] = 0.25 * (phiPrev[c - 1] + phiPrev[c + 1] +
                             phiPrev[c - N] + phiPrev[c + N] -
                             h2 * source[c]);
        }
    }
}

// GPU kernel
// GPU version of sweepCPU: one thread per grid point, 2-D launch.
// Each interior point is updated from its four neighbours in phiPrev;
// boundary points and threads outside the N x N domain return early.
__global__ void sweepGPU(double *phi, const double *phiPrev, const double *source, double h2, int N)
{
    const int x = threadIdx.x + blockIdx.x * blockDim.x;
    const int y = threadIdx.y + blockIdx.y * blockDim.y;

    // Guard both the domain boundary and the grid tail.
    if (x <= 0 || y <= 0 || x >= N - 1 || y >= N - 1)
        return;

    // Row-major layout: +/-1 moves in x, +/-N moves in y.
    const int c = x + y * N;
    const double left  = phiPrev[c - 1];
    const double right = phiPrev[c + 1];
    const double below = phiPrev[c - N];
    const double above = phiPrev[c + N];
    phi[c] = 0.25 * (left + right + below + above - h2 * source[c]);
}

// Mean absolute difference between two N*N arrays.
double compareArrays(const double *a, const double *b, int N)
{
    const int total = N * N;
    double accum = 0.0;
    for (int k = 0; k < total; ++k)
        accum += fabs(a[k] - b[k]);
    return accum / total;
}

// Relative L2 norm of the change between two iterates:
//   sqrt( sum_i (phi[i]-phiPrev[i])^2 / sum_i phi[i]^2 )
double diffCPU(const double *phi, const double *phiPrev, int N)
{
    const int total = N * N;
    double norm2 = 0.0;
    double delta2 = 0.0;

    for (int k = 0; k < total; ++k)
    {
        const double d = phi[k] - phiPrev[k];
        delta2 += d * d;
        norm2 += phi[k] * phi[k];
    }

    return sqrt(delta2 / norm2);
}

// Jacobi iteration driver: solves the Poisson problem on an N x N grid on
// both CPU (optional reference) and GPU, then compares the two results.
// Fixes relative to the original:
//  - lines 450/467/477 were bare prose without "//" -> compile errors;
//  - printf passed the integer clock_t difference for %g (undefined
//    behaviour) and never divided by CLOCKS_PER_SEC -> convert to ms;
//  - the GPU result was copied into `phi`, clobbering the CPU reference,
//    while the final comparison read the never-written `phi_cuda`.
//    The GPU result now goes into `phi_cuda` so the comparison is valid.
int main()
{
    clock_t t1, t2; // Structs for timing
    const int N = 512;             // solve on an N x N grid
    double h = 1.0 / (N - 1);      // grid spacing
    int iterations;
    const double tolerance = 5e-4; // Stopping condition
    int i, j, index;

    const int blocksize = 16; // 16x16 = 256 threads per block

    double *phi = new double[N * N];      // CPU reference result
    double *phiPrev = new double[N * N];
    double *source = new double[N * N];
    double *phi_cuda = new double[N * N]; // receives the GPU result

    double *phi_d, *phiPrev_d, *source_d; // device buffers
    // Size of the arrays in bytes
    const int size = N * N * sizeof(double);
    double diff;

    // Source initialization: two small disks of opposite sign, offset
    // from the grid center along x.
    for (i = 0; i < N; i++)
    {
        for (j = 0; j < N; j++)
        {
            double x, y;
            x = (i - N / 2) * h;
            y = (j - N / 2) * h;
            index = j + i * N;
            if (((x - 0.25) * (x - 0.25) + y * y) < 0.1 * 0.1)
                source[index] = 1e10 * h * h;
            else if (((x + 0.25) * (x + 0.25) + y * y) < 0.1 * 0.1)
                source[index] = -1e10 * h * h;
            else
                source[index] = 0.0;
        }
    }

    CUDA_CHECK(cudaMalloc((void **)&source_d, size));
    CUDA_CHECK(cudaMemcpy(source_d, source, size, cudaMemcpyHostToDevice));

    // Reset values to zero
    for (i = 0; i < N; i++)
    {
        for (j = 0; j < N; j++)
        {
            index = j + i * N;
            phi[index] = 0.0;
            phiPrev[index] = 0.0;
        }
    }

    CUDA_CHECK(cudaMalloc((void **)&phi_d, size));
    CUDA_CHECK(cudaMalloc((void **)&phiPrev_d, size));
    CUDA_CHECK(cudaMemcpy(phi_d, phi, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(phiPrev_d, phiPrev, size, cudaMemcpyHostToDevice));

    // CPU version
    if (COMPUTE_CPU_REFERENCE)
    {
        t1 = clock();

        // Do sweeps until difference is under the tolerance
        diff = tolerance * 2;
        iterations = 0;
        while (diff > tolerance && iterations < MAX_ITERATIONS)
        {
            // Ping-pong the two arrays so each sweep reads the other's
            // most recent output.
            sweepCPU(phiPrev, phi, source, h * h, N);
            sweepCPU(phi, phiPrev, source, h * h, N);

            iterations += 2;
            if (iterations % 100 == 0)
            {
                diff = diffCPU(phi, phiPrev, N);
                printf("%d %g\n", iterations, diff);
            }
        }
        t2 = clock();
        // clock_t is an integer type: convert to milliseconds before
        // printing with %g.
        printf("CPU Jacobi: %g ms, %d iterations\n",
               1000.0 * (double)(t2 - t1) / CLOCKS_PER_SEC,
               iterations);
    }

    // GPU version

    dim3 dimBlock(blocksize, blocksize);
    dim3 dimGrid((N + blocksize - 1) / blocksize, (N + blocksize - 1) / blocksize);

    // Do sweeps until diff is under the tolerance
    diff = tolerance * 2;
    iterations = 0;

    t1 = clock();

    while (diff > tolerance && iterations < MAX_ITERATIONS)
    {
        // Same ping-pong scheme as the CPU loop; each sweep is one kernel
        // launch over the whole grid.
        sweepGPU<<<dimGrid, dimBlock>>>(phiPrev_d, phi_d, source_d, h * h, N);
        sweepGPU<<<dimGrid, dimBlock>>>(phi_d, phiPrev_d, source_d, h * h, N);

        iterations += 2;

        if (iterations % 100 == 0)
        {
            // diffGPU is defined in the header file, it uses
            // Thrust library for reduction computation
            diff = diffGPU<double>(phiPrev_d, phi_d, N);
            CHECK_ERROR_MSG("Difference computation");
            printf("%d %g\n", iterations, diff);
        }
    }

    // Copy the GPU result into phi_cuda (NOT phi) so the CPU reference
    // in phi survives for the comparison below.
    CUDA_CHECK(cudaMemcpy(phi_cuda, phi_d, size, cudaMemcpyDeviceToHost));

    t2 = clock();
    printf("GPU Jacobi: %g ms, %d iterations\n",
           1000.0 * (double)(t2 - t1) / CLOCKS_PER_SEC,
           iterations);

    // Clean up all allocated CUDA resources.
    CUDA_CHECK(cudaFree(phi_d));
    CUDA_CHECK(cudaFree(phiPrev_d));
    CUDA_CHECK(cudaFree(source_d));

    if (COMPUTE_CPU_REFERENCE)
    {
        printf("Average difference is %g\n", compareArrays(phi, phi_cuda, N));
    }

    delete[] phi;
    delete[] phi_cuda;
    delete[] phiPrev;
    delete[] source;

    return EXIT_SUCCESS;
}
class="utterances-frame" title="Comments" scrolling="no" src="https://utteranc.es/utterances.html?async=async&repo=wu-kan%2Futterances-storage&src=https%3A%2F%2Futteranc.es%2Fclient.js&issue-term=url&theme=github-light&crossorigin=anonymous&type=text%2Fjavascript&url=https%3A%2F%2Fwu-kan.cn%2F2019%2F05%2F28%2F%25E5%25B9%25B6%25E8%25A1%258C%25E4%25B8%258E%25E5%2588%2586%25E5%25B8%2583%25E5%25BC%258F%25E8%25AE%25A1%25E7%25AE%2597-6%2F&origin=https%3A%2F%2Fwu-kan.cn&pathname=2019%2F05%2F28%2F%25E5%25B9%25B6%25E8%25A1%258C%25E4%25B8%258E%25E5%2588%2586%25E5%25B8%2583%25E5%25BC%258F%25E8%25AE%25A1%25E7%25AE%2597-6%2F&title=CUDA%E7%BB%83%E4%B9%A0%E4%B8%A4%E5%88%99+%C2%B7+wu-kan&description=CUDA-homework-1&og%3Atitle=CUDA%E7%BB%83%E4%B9%A0%E4%B8%A4%E5%88%99&session=" loading="lazy" style="box-sizing: border-box; color-scheme: light; left: 0px; right: 0px; width: 1px; min-width: 100%; max-width: 100%; height: 268px; border: 0px;">
  • 25
    点赞
  • 17
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值