1、CPU的单线程版本（CPU single-threaded version）
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <chrono>
using namespace std;
using namespace std::chrono;
// Tolerance for floating-point comparison (declared for result checking but unused in this program).
const double EPSILON = 1.0e-15;
// Fill values for the two input arrays.
const double a = 1.23;
const double b = 2.34;
// Presumably the expected element-wise sum a + b — unused in this program.
const double c = 3.57;
// CPU single-threaded baseline: element-wise sum z = x + y over N doubles,
// timing only the addition loop.
int main(int argc, char* argv[])
{
    // Problem size; each array is sizeof(double) * N = ~1.6 GB.
    const int N = 200000000;
    const size_t M = sizeof(double) * N;
    double * h_x = (double*) malloc(M);
    double * h_y = (double*) malloc(M);
    double * h_z = (double*) malloc(M);
    // Fix: allocations this large can fail; dereferencing NULL would crash.
    if (h_x == NULL || h_y == NULL || h_z == NULL)
    {
        fprintf(stderr, "host malloc failed\n");
        free(h_x); free(h_y); free(h_z);
        return 1;
    }
    for (int n = 0; n < N; ++n)
    {
        h_x[n] = a;
        h_y[n] = b;
    }
    auto start_time = steady_clock::now();
    for (int n = 0; n < N; ++n)
    {
        h_z[n] = h_x[n] + h_y[n];
    }
    auto end_time = steady_clock::now();
    // duration<double, micro> / 1000.0 yields the elapsed time in milliseconds.
    auto duration_time = duration_cast<microseconds>(end_time - start_time) / 1000.0;
    cout << "time cost : " << duration_time.count() << " ms" << endl;
    // Fix: the original leaked all three ~1.6 GB buffers.
    free(h_x);
    free(h_y);
    free(h_z);
    return 0;
}
2、CPU的多线程版本（CPU multi-threaded version, implemented with OpenMP）
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <chrono>
#include <omp.h>
using namespace std;
using namespace std::chrono;
// Tolerance for floating-point comparison (declared for result checking but unused in this program).
const double EPSILON = 1.0e-15;
// Fill values for the two input arrays.
const double a = 1.23;
const double b = 2.34;
// Presumably the expected element-wise sum a + b — unused in this program.
const double c = 3.57;
// CPU multi-threaded version: same element-wise sum as the serial baseline,
// with the addition loop parallelized via OpenMP. Timing covers the loop only.
int main(int argc, char* argv[])
{
    // Problem size; each array is sizeof(double) * N = ~1.6 GB.
    const int N = 200000000;
    const size_t M = sizeof(double) * N;
    double * h_x = (double*) malloc(M);
    double * h_y = (double*) malloc(M);
    double * h_z = (double*) malloc(M);
    // Fix: allocations this large can fail; dereferencing NULL would crash.
    if (h_x == NULL || h_y == NULL || h_z == NULL)
    {
        fprintf(stderr, "host malloc failed\n");
        free(h_x); free(h_y); free(h_z);
        return 1;
    }
    for (int n = 0; n < N; ++n)
    {
        h_x[n] = a;
        h_y[n] = b;
    }
    auto start_time = steady_clock::now();
    // Default (static) scheduling suits this perfectly uniform workload;
    // the dynamic/num_threads variant below was kept for experimentation.
    //#pragma omp parallel for schedule(dynamic) num_threads(64)
    #pragma omp parallel for
    for (int n = 0; n < N; ++n)
    {
        h_z[n] = h_x[n] + h_y[n];
    }
    auto end_time = steady_clock::now();
    // duration<double, micro> / 1000.0 yields the elapsed time in milliseconds.
    auto duration_time = duration_cast<microseconds>(end_time - start_time) / 1000.0;
    cout << "time cost : " << duration_time.count() << " ms" << endl;
    // Fix: the original leaked all three ~1.6 GB buffers.
    free(h_x);
    free(h_y);
    free(h_z);
    return 0;
}
3、CUDA版本（CUDA version）
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <chrono>
using namespace std;
using namespace std::chrono;
// Tolerance for floating-point comparison (declared for result checking but unused in this program).
const double EPSILON = 1.0e-15;
// Fill values for the two input arrays.
const double a = 1.23;
const double b = 2.34;
// Presumably the expected element-wise sum a + b — unused in this program.
const double c = 3.57;
// Element-wise vector add kernel: z[i] = x[i] + y[i], one thread per element.
// NOTE(review): no bounds check — the launcher must cover exactly N elements.
void __global__ add(const double *x, const double *y, double *z);
// CUDA version: copies x and y to the device, runs the add kernel with one
// thread per element, and times the kernel execution.
int main(int argc, char* argv[])
{
    // Problem size; each array is sizeof(double) * N = ~1.6 GB.
    const int N = 200000000;
    const size_t M = sizeof(double) * N;
    double * h_x = (double*) malloc(M);
    double * h_y = (double*) malloc(M);
    double * h_z = (double*) malloc(M);
    // Fix: check the ~4.8 GB of host allocations before writing through them.
    if (h_x == NULL || h_y == NULL || h_z == NULL)
    {
        fprintf(stderr, "host malloc failed\n");
        free(h_x); free(h_y); free(h_z);
        return 1;
    }
    for (int n = 0; n < N; ++n)
    {
        h_x[n] = a;
        h_y[n] = b;
    }
    double *d_x = NULL, *d_y = NULL, *d_z = NULL;
    // Fix: ~4.8 GB of device memory will fail on smaller GPUs; the original
    // ignored the return codes and would write through invalid pointers.
    if (cudaMalloc((void **)&d_x, M) != cudaSuccess ||
        cudaMalloc((void **)&d_y, M) != cudaSuccess ||
        cudaMalloc((void **)&d_z, M) != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed\n");
        cudaFree(d_x); cudaFree(d_y); cudaFree(d_z);
        free(h_x); free(h_y); free(h_z);
        return 1;
    }
    cudaMemcpy(d_x, h_x, M, cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, h_y, M, cudaMemcpyHostToDevice);
    // N is an exact multiple of block_size, so the grid covers every element;
    // the kernel has no bounds check, so this must remain true.
    const int block_size = 64;
    const int grid_size = N / block_size;
    auto start_time = steady_clock::now();
    add<<<grid_size, block_size>>>(d_x, d_y, d_z);
    // Fix: kernel launches are asynchronous. The original stopped the timer
    // immediately after the launch and therefore measured only launch
    // overhead, not the kernel's execution time.
    cudaDeviceSynchronize();
    auto end_time = steady_clock::now();
    // Fix: surface launch/execution errors instead of silently copying garbage back.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "add kernel failed: %s\n", cudaGetErrorString(err));
    }
    auto duration_time = duration_cast<microseconds>(end_time - start_time) / 1000.0;
    cout << "time cost : " << duration_time.count() << " ms" << endl;
    cudaMemcpy(h_z, d_z, M, cudaMemcpyDeviceToHost);
    free(h_x);
    free(h_y);
    free(h_z);
    cudaFree(d_x);
    cudaFree(d_y);
    cudaFree(d_z);
    return 0;
}
// Element-wise vector add: z[idx] = x[idx] + y[idx], one thread per element.
// NOTE(review): no bounds guard and no element count in the signature — the
// launch configuration must cover exactly the array length (the caller uses
// N divisible by the block size); confirm before reusing with other sizes.
void __global__ add(const double *x, const double *y, double *z)
{
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
    z[idx] = x[idx] + y[idx];
}
4、加入错误检查后的代码（CUDA version with error checking added）
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <chrono>
using namespace std;
using namespace std::chrono;
// Tolerance for floating-point comparison (declared for result checking but unused in this program).
const double EPSILON = 1.0e-15;
// Host wrapper: allocates device buffers, copies a/b up, launches addKernel,
// and copies the result into c. Returns the first CUDA error encountered.
cudaError_t addWithCuda(double*c, double *a, double*b, unsigned int size);
// Element-wise vector add: c[i] = a[i] + b[i], one thread per element.
// NOTE(review): no bounds guard and no element count in the signature — the
// launcher must cover exactly `size` elements (size divisible by block size).
__global__ void addKernel(double*c, double* a, double* b)
{
    // Fix: the original used only threadIdx.x, so with a multi-block launch
    // every block rewrote the same first blockDim.x elements and the rest of
    // c was never computed. Use the standard flat global index instead.
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    c[i] = a[i] + b[i];
}
// Driver: fills two host arrays, delegates the GPU work (with error checking)
// to addWithCuda, and resets the device so profilers get complete traces.
int main()
{
    // Problem size; each array is sizeof(double) * N = ~1.6 GB of host memory.
    const int N = 200000000;
    const size_t M = sizeof(double) * N;
    double* a = (double*)malloc(M);
    double* b = (double*)malloc(M);
    double* c = (double*)malloc(M);
    // Fix: check the ~4.8 GB of host allocations before writing through them.
    if (a == NULL || b == NULL || c == NULL) {
        fprintf(stderr, "host malloc failed\n");
        free(a); free(b); free(c);
        return 1;
    }
    for (int n = 0; n < N; ++n)
    {
        a[n] = 1.23;
        b[n] = 2.34;
        c[n] = 0.0;
    }
    // Add vectors in parallel.
    cudaError_t cudaStatus = addWithCuda(c, a, b, N);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        free(a); free(b); free(c);
        return 1;
    }
    // Fix: the original leaked a, b and c on every exit path.
    free(a);
    free(b);
    free(c);
    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }
    return 0;
}
// Helper function for using CUDA to add vectors in parallel.
// Allocates three device buffers, copies a/b to the device, launches addKernel
// with one thread per element, and copies the result back into c.
// Precondition: `size` must be a multiple of block_size (64); grid_size is
// rounded down, so trailing elements would otherwise be silently skipped.
cudaError_t addWithCuda(double* c, double* a, double* b, unsigned int size)
{
    double *dev_a = 0;
    double *dev_b = 0;
    double *dev_c = 0;
    cudaError_t cudaStatus;
    // Fix: hoisted above the first goto. The original declared these (and the
    // chrono locals) after the error-path gotos, which is ill-formed C++ — a
    // goto may not jump over an initialized declaration still in scope at the
    // label.
    const int block_size = 64;
    const int grid_size = (int)(size / block_size);
    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }
    // Allocate GPU buffers for three vectors (two input, one output).
    cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(double));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(double));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(double));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(double), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(double), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    // Nested scope so the chrono locals go out of scope before the Error
    // label — the earlier gotos may then legally jump past this block.
    {
        auto start_time = steady_clock::now();
        // Launch a kernel on the GPU with one thread for each element.
        addKernel<<<grid_size, block_size>>>(dev_c, dev_a, dev_b);
        // Check for any errors launching the kernel.
        cudaStatus = cudaGetLastError();
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
            goto Error;
        }
        // cudaDeviceSynchronize waits for the kernel to finish, and returns
        // any errors encountered during the launch.
        cudaStatus = cudaDeviceSynchronize();
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
            goto Error;
        }
        // Fix: the launch is asynchronous, so the clock must stop only after
        // the synchronize above — the original stopped it right after the
        // launch and measured only launch overhead, not kernel execution.
        auto end_time = steady_clock::now();
        auto duration_time = duration_cast<microseconds>(end_time - start_time) / 1000.0;
        cout << "time cost : " << duration_time.count() << " ms" << endl;
    }
    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(double), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
Error:
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);
    return cudaStatus;
}