CUDA Programming: Comparing the Efficiency of GPU Parallelism and CPU Parallelism

This post uses a simple repeated-addition workload to show how to run a parallel computation on the GPU with CUDA and compares it against a multithreaded CPU implementation of the same task. Two integer arrays, sized to the GPU's full thread capacity, are filled with random numbers; a CUDA kernel performs a large number of additions on the device and the result is copied back to the host; the same work is then redone with CPU threads, and the two running times and results are compared. The timings show the GPU's advantage on this kind of data-parallel workload.

#include <algorithm>
#include <chrono>
#include <cuda.h>
#include <cuda_runtime.h>
#include <functional>
#include <iostream>
#include <memory>
#include <random>
#include <thread>
namespace
{
// How many times each element gets added; large enough that the per-element
// work dominates over launch and memory-transfer overhead.
constexpr int ADD_COUNT = 1 << 19;
}
// Kernel: repeatedly add a[idx] into b[idx] modulo 100 (defined after main).
__global__ void dev_add(int *a, int *b);
int main()
{
    // Query the device so the problem size matches the GPU: one block per SM,
    // each block at its maximum thread count, one thread per array element.
    cudaDeviceProp devProp;
    cudaGetDeviceProperties(&devProp, 0);
    const int BLOCK_PER_GRID = devProp.multiProcessorCount;
    const int THREAD_PER_BLOCK = devProp.maxThreadsPerBlock;
    const int TOTAL_COUNT = BLOCK_PER_GRID * THREAD_PER_BLOCK;
    // Exactly TOTAL_COUNT ints; padding the size would make the cudaMemcpy
    // calls below read and write past the end of the host arrays.
    const int BUFFER_SIZE = TOTAL_COUNT * sizeof(int);
    // a and b are the inputs; c starts as a copy of b so the CPU can redo the
    // same computation and the two results can be compared at the end.
    auto a{std::make_unique<int[]>(TOTAL_COUNT)}, b{std::make_unique<int[]>(TOTAL_COUNT)},
        c{std::make_unique<int[]>(TOTAL_COUNT)};
    std::default_random_engine gen{
        unsigned(std::random_device{}() + std::chrono::steady_clock::now().time_since_epoch().count())};
    std::for_each(a.get(), a.get() + TOTAL_COUNT, [&gen](int &cur) { cur = gen() % 100; });
    std::for_each(b.get(), b.get() + TOTAL_COUNT, [&gen](int &cur) { cur = gen() % 100; });
    std::copy_n(b.get(), TOTAL_COUNT, c.get());
    std::cout << "Total: " << TOTAL_COUNT << '\n';
std::cout << "GPU Start\n";
{
auto t0 = std::chrono::steady_clock::now();
int *rd_a, *rd_b;
cudaMalloc(&rd_a, BUFFER_SIZE);
cudaMalloc(&rd_b, BUFFER_SIZE);
std::unique_ptr<int[], std::function<void(int *)>> d_a{rd_a, cudaFree}, d_b{rd_b, cudaFree};
cudaMemcpy(d_a.get(), a.get(), BUFFER_SIZE, cudaMemcpyHostToDevice);
cudaMemcpy(d_b.get(), b.get(), BUFFER_SIZE, cudaMemcpyHostToDevice);
dim3 blkd(BLOCK_PER_GRID), thrd(THREAD_PER_BLOCK);
dev_add<<<blkd, thrd>>>(d_a.get(), d_b.get());
cudaMemcpy(b.get(), d_b.get(), BUFFER_SIZE, cudaMemcpyDeviceToHost);
auto t1 = std::chrono::steady_clock::now();
std::cout << "GPU: " << std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0).count() << '\n';
}
std::cout << "\nCPU Start\n";
{
auto t0 = std::chrono::steady_clock::now();
const int hc = std::thread::hardware_concurrency();
const int per = (TOTAL_COUNT + hc - 1) / hc;
auto ths{std::make_unique<std::thread[]>(hc)};
for (int i = 0; i < hc; i++)
{
ths[i] = std::thread{[i, &per, &TOTAL_COUNT, &c, &a]() {
for (int j = i * per, up = std::min((i + 1) * per, TOTAL_COUNT); j < up; ++j)
{
int &cc = c[j], ca = a[j];
for (int k = 0; k < ADD_COUNT; k++)
{
cc = (cc + ca) % 100;
}
}
}};
}
for (int i = 0; i < hc; i++)
{
ths[i].join();
}
auto t1 = std::chrono::steady_clock::now();
std::cout << "CPU: " << std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0).count() << '\n';
}
    // b now holds the GPU result and c the CPU result; they must agree.
    auto mis = std::mismatch(b.get(), b.get() + TOTAL_COUNT, c.get());
    if (mis.first != b.get() + TOTAL_COUNT)
        std::cout << "\nfirst mismatch position: " << mis.first - b.get() << '\n';
    else
        std::cout << "\nmatched\n";
    return 0;
}
// Each thread owns exactly one element; the grid was sized so that idx never
// exceeds TOTAL_COUNT, so no bounds check is needed here.
__global__ void dev_add(int *a, int *b)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int &cb = b[idx], &ca = a[idx];
    for (int i = 0; i < ADD_COUNT; ++i)
    {
        cb = (cb + ca) % 100;
    }
}
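
One thing the listing leaves out is CUDA error checking: cudaMalloc, cudaMemcpy, and the kernel launch can all fail here without any visible sign. A minimal checking helper is sketched below; the cudaCheck name is my own addition, not something from the original code, and it assumes <cstdlib> is included for std::exit.

// Hypothetical helper: abort with a readable message when a CUDA
// runtime call does not return cudaSuccess.
inline void cudaCheck(cudaError_t err, const char *what)
{
    if (err != cudaSuccess)
    {
        std::cerr << what << ": " << cudaGetErrorString(err) << '\n';
        std::exit(EXIT_FAILURE);
    }
}

// Usage, e.g.:
//   cudaCheck(cudaMalloc(&rd_a, BUFFER_SIZE), "cudaMalloc rd_a");
//   dev_add<<<blkd, thrd>>>(d_a.get(), d_b.get());
//   cudaCheck(cudaGetLastError(), "dev_add launch");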
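
Also note that the GPU time printed above is an end-to-end figure: it includes device allocation and both host/device transfers, not just the kernel. To time the kernel alone, CUDA events are the standard tool; a minimal sketch, reusing the variable names from the listing:

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
dev_add<<<blkd, thrd>>>(d_a.get(), d_b.get());
cudaEventRecord(stop);
cudaEventSynchronize(stop); // block until the kernel and the stop event complete
float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop); // elapsed GPU time in milliseconds
cudaEventDestroy(start);
cudaEventDestroy(stop);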
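
To build and run, nvcc compiles the whole file. The file name below is arbitrary; std::make_unique requires at least C++14, and depending on the host toolchain you may also need -Xcompiler -pthread for std::thread:

nvcc -O2 -std=c++14 gpu_vs_cpu.cu -o gpu_vs_cpu
./gpu_vs_cpu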