1、CPU的单线程版本（CPU single-threaded version）
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <chrono>
using namespace std;
using namespace std::chrono;
// Tolerance for floating-point comparison (declared for result checking but unused in this program).
const double EPSILON = 1.0e-15;
// Fill values for the two input arrays.
const double a = 1.23;
const double b = 2.34;
// Presumably the expected element-wise sum a + b — unused in this program.
const double c = 3.57;
// CPU single-threaded baseline: element-wise sum z = x + y over N doubles,
// timing only the addition loop.
int main(int argc, char* argv[])
{
    // Problem size; each array is sizeof(double) * N = ~1.6 GB.
    const int N = 200000000;
    const size_t M = sizeof(double) * N;
    double * h_x = (double*) malloc(M);
    double * h_y = (double*) malloc(M);
    double * h_z = (double*) malloc(M);
    // Fix: allocations this large can fail; dereferencing NULL would crash.
    if (h_x == NULL || h_y == NULL || h_z == NULL)
    {
        fprintf(stderr, "host malloc failed\n");
        free(h_x); free(h_y); free(h_z);
        return 1;
    }
    for (int n = 0; n < N; ++n)
    {
        h_x[n] = a;
        h_y[n] = b;
    }
    auto start_time = steady_clock::now();
    for (int n = 0; n < N; ++n)
    {
        h_z[n] = h_x[n] + h_y[n];
    }
    auto end_time = steady_clock::now();
    // duration<double, micro> / 1000.0 yields the elapsed time in milliseconds.
    auto duration_time = duration_cast<microseconds>(end_time - start_time) / 1000.0;
    cout << "time cost : " << duration_time.count() << " ms" << endl;
    // Fix: the original leaked all three ~1.6 GB buffers.
    free(h_x);
    free(h_y);
    free(h_z);
    return 0;
}
2、CPU的多线程版本（CPU multi-threaded version, implemented with OpenMP）
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <chrono>
#include <omp.h>
using namespace std;
using namespace std::chrono;
// Tolerance for floating-point comparison (declared for result checking but unused in this program).
const double EPSILON = 1.0e-15;
// Fill values for the two input arrays.
const double a = 1.23;
const double b = 2.34;
// Presumably the expected element-wise sum a + b — unused in this program.
const double c = 3.57;
// CPU multi-threaded version: same element-wise sum as the serial baseline,
// with the addition loop parallelized via OpenMP. Timing covers the loop only.
int main(int argc, char* argv[])
{
    // Problem size; each array is sizeof(double) * N = ~1.6 GB.
    const int N = 200000000;
    const size_t M = sizeof(double) * N;
    double * h_x = (double*) malloc(M);
    double * h_y = (double*) malloc(M);
    double * h_z = (double*) malloc(M);
    // Fix: allocations this large can fail; dereferencing NULL would crash.
    if (h_x == NULL || h_y == NULL || h_z == NULL)
    {
        fprintf(stderr, "host malloc failed\n");
        free(h_x); free(h_y); free(h_z);
        return 1;
    }
    for (int n = 0; n < N; ++n)
    {
        h_x[n] = a;
        h_y[n] = b;
    }
    auto start_time = steady_clock::now();
    // Default (static) scheduling suits this perfectly uniform workload;
    // the dynamic/num_threads variant below was kept for experimentation.
    //#pragma omp parallel for schedule(dynamic) num_threads(64)
    #pragma omp parallel for
    for (int n = 0; n < N; ++n)
    {
        h_z[n] = h_x[n] + h_y[n];
    }
    auto end_time = steady_clock::now();
    // duration<double, micro> / 1000.0 yields the elapsed time in milliseconds.
    auto duration_time = duration_cast<microseconds>(end_time - start_time) / 1000.0;
    cout << "time cost : " << duration_time.count() << " ms" << endl;
    // Fix: the original leaked all three ~1.6 GB buffers.
    free(h_x);
    free(h_y);
    free(h_z);
    return 0;
}
3、CUDA版本（CUDA version）
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <chrono>
using namespace std;
using namespace std::chrono;
// Tolerance for floating-point comparison (declared for result checking but unused in this program).
const double EPSILON = 1.0e-15;
// Fill values for the two input arrays.
const double a = 1.23;
const double b = 2.34;
// Presumably the expected element-wise sum a + b — unused in this program.
const double c = 3.57;
// Element-wise vector add kernel: z[i] = x[i] + y[i], one thread per element.
// NOTE(review): no bounds check — the launcher must cover exactly N elements.
void __global__ add(const double *x, const double *y, double *z);
// CUDA version: copies x and y to the device, runs the add kernel with one
// thread per element, and times the kernel execution.
int main(int argc, char* argv[])
{
    // Problem size; each array is sizeof(double) * N = ~1.6 GB.
    const int N = 200000000;
    const size_t M = sizeof(double) * N;
    double * h_x = (double*) malloc(M);
    double * h_y = (double*) malloc(M);
    double * h_z = (double*) malloc(M);
    // Fix: check the ~4.8 GB of host allocations before writing through them.
    if (h_x == NULL || h_y == NULL || h_z == NULL)
    {
        fprintf(stderr, "host malloc failed\n");
        free(h_x); free(h_y); free(h_z);
        return 1;
    }
    for (int n = 0; n < N; ++n)
    {
        h_x[n] = a;
        h_y[n] = b;
    }
    double *d_x = NULL, *d_y = NULL, *d_z = NULL;
    // Fix: ~4.8 GB of device memory will fail on smaller GPUs; the original
    // ignored the return codes and would write through invalid pointers.
    if (cudaMalloc((void **)&d_x, M) != cudaSuccess ||
        cudaMalloc((void **)&d_y, M) != cudaSuccess ||
        cudaMalloc((void **)&d_z, M) != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed\n");
        cudaFree(d_x); cudaFree(d_y); cudaFree(d_z);
        free(h_x); free(h_y); free(h_z);
        return 1;
    }
    cudaMemcpy(d_x, h_x, M, cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, h_y, M, cudaMemcpyHostToDevice);
    // N is an exact multiple of block_size, so the grid covers every element;
    // the kernel has no bounds check, so this must remain true.
    const int block_size = 64;
    const int grid_size = N / block_size;
    auto start_time = steady_clock::now();
    add<<<grid_size, block_size>>>(d_x, d_y, d_z);
    // Fix: kernel launches are asynchronous. The original stopped the timer
    // immediately after the launch and therefore measured only launch
    // overhead, not the kernel's execution time.
    cudaDeviceSynchronize();
    auto end_time = steady_clock::now();
    // Fix: surface launch/execution errors instead of silently copying garbage back.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "add kernel failed: %s\n", cudaGetErrorString(err));
    }
    auto duration_time = duration_cast<microseconds>(end_time - start_time) / 1000.0;
    cout << "time cost : " << duration_time.count() << " ms" << endl;
    cudaMemcpy(h_z, d_z, M, cudaMemcpyDeviceToHost);
    free(h_x);
    free(h_y);
    free(h_z);
    cudaFree(d_x);
    cudaFree(d_y);
    cudaFree(d_z);
    return 0;
}
// Element-wise vector add: z[idx] = x[idx] + y[idx], one thread per element.
// NOTE(review): no bounds guard and no element count in the signature — the
// launch configuration must cover exactly the array length (the caller uses
// N divisible by the block size); confirm before reusing with other sizes.
void __global__ add(const double *x, const double *y, double *z)
{
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
    z[idx] = x[idx] + y[idx];
}
4、加入错误检查后的代码（CUDA version with error checking added）
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <chrono>
using namespace std;
using namespace std::chrono;
// Tolerance for floating-point comparison (declared for result checking but unused in this program).
const double EPSILON = 1.0e-15;
// Host wrapper: allocates device buffers, copies a/b up, launches addKernel,
// and copies the result into c. Returns the first CUDA error encountered.
cudaError_t addWithCuda(double*c, double *a, double*b, unsigned int size);
// Element-wise vector add: c[i] = a[i] + b[i], one thread per element.
// NOTE(review): no bounds guard and no element count in the signature — the
// launcher must cover exactly `size` elements (size divisible by block size).
__global__ void addKernel(double*c, double* a, double* b)
{
    // Fix: the original used only threadIdx.x, so with a multi-block launch
    // every block rewrote the same first blockDim.x elements and the rest of
    // c was never computed. Use the standard flat global index instead.
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    c[i] = a[i] + b[i];
}
// Driver: fills two host arrays, delegates the GPU work (with error checking)
// to addWithCuda, and resets the device so profilers get complete traces.
int main()
{
    // Problem size; each array is sizeof(double) * N = ~1.6 GB of host memory.
    const int N = 200000000;
    const size_t M = sizeof(double) * N;
    double* a = (double*)malloc(M);
    double* b = (double*)malloc(M);
    double* c = (double*)malloc(M);
    // Fix: check the ~4.8 GB of host allocations before writing through them.
    if (a == NULL || b == NULL || c == NULL) {
        fprintf(stderr, "host malloc failed\n");
        free(a); free(b); free(c);
        return 1;
    }
    for (int n = 0; n < N; ++n)
    {
        a[n] = 1.23;
        b[n] = 2.34;
        c[n] = 0.0;
    }
    // Add vectors in parallel.
    cudaError_t cudaStatus = addWithCuda(c, a, b, N);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        free(a); free(b); free(c);
        return 1;
    }
    // Fix: the original leaked a, b and c on every exit path.
    free(a);
    free(b);
    free(c);
    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }
    return 0;
}
// Helper function for using CUDA to add vectors in parallel.
// Allocates three device buffers, copies a/b to the device, launches addKernel
// with one thread per element, and copies the result back into c.
// Precondition: `size` must be a multiple of block_size (64); grid_size is
// rounded down, so trailing elements would otherwise be silently skipped.
cudaError_t addWithCuda(double* c, double* a, double* b, unsigned int size)
{
    double *dev_a = 0;
    double *dev_b = 0;
    double *dev_c = 0;
    cudaError_t cudaStatus;
    // Fix: hoisted above the first goto. The original declared these (and the
    // chrono locals) after the error-path gotos, which is ill-formed C++ — a
    // goto may not jump over an initialized declaration still in scope at the
    // label.
    const int block_size = 64;
    const int grid_size = (int)(size / block_size);
    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }
    // Allocate GPU buffers for three vectors (two input, one output).
    cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(double));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(double));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(double));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(double), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(double), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    // Nested scope so the chrono locals go out of scope before the Error
    // label — the earlier gotos may then legally jump past this block.
    {
        auto start_time = steady_clock::now();
        // Launch a kernel on the GPU with one thread for each element.
        addKernel<<<grid_size, block_size>>>(dev_c, dev_a, dev_b);
        // Check for any errors launching the kernel.
        cudaStatus = cudaGetLastError();
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
            goto Error;
        }
        // cudaDeviceSynchronize waits for the kernel to finish, and returns
        // any errors encountered during the launch.
        cudaStatus = cudaDeviceSynchronize();
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
            goto Error;
        }
        // Fix: the launch is asynchronous, so the clock must stop only after
        // the synchronize above — the original stopped it right after the
        // launch and measured only launch overhead, not kernel execution.
        auto end_time = steady_clock::now();
        auto duration_time = duration_cast<microseconds>(end_time - start_time) / 1000.0;
        cout << "time cost : " << duration_time.count() << " ms" << endl;
    }
    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(double), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
Error:
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);
    return cudaStatus;
}