本文运用两种方法基于CUDA编程求Π,并将CUDA并行和纯CPU串行的运行时间作输出对比。## 标题
CUDA积分法求Π
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
const int N = 1024*1024*64; //积分时划分的份数
const int Threads_Per_block = 256; //block中的线程数
const int Blocks_Per_gird = 64;
__global__ void compute(double *a, double *b, double *c){
__shared__ double cache[Threads_Per_block];
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int cacheIndex = threadIdx.x;
double x =0.0;
double temp = 0.0;
while(tid < N){
x = *a + (double)(*b-*a)/N*(tid + 0.5);
temp += 4/(1+x*x);
tid += blockDim.x * gridDim.x;
}
cache[cacheIndex] = temp;
__syncthreads();
int i = blockDim.x/2;
while(i != 0){
if(cacheIndex < i)
cache[cacheIndex] += cache[cacheIndex + i];
__syncthreads();
i /= 2;
}
if(cacheIndex == 0)
c[blockIdx.x] = cache[0];
}
int main(){
double a,b;
double* c;
double *d_a,*d_b,*d_c;
double pi_g=0.0;
cudaEvent_t start,stop;
float time;
a = 0.0;
b = 1.0;
clock_t begin,end;
float period;
double pi_c=0.0;
double x;
begin = clock();
for(int i=0;i<N;i++){
x = a + (double)(b-a)/N*(i+0.5);
pi_c += 4/(1+x*x);
}
pi_c *= (double)(b-a)/N;
end = clock();
period = (float)1000*(end - begin)/CLOCKS_PER_SEC;
printf("CPU: pi is %.20lf\n", pi_c);
printf(" cost time: %.6f ms\n", period);
getchar();
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start,0);
c = (double*)malloc(Blocks_Per_gird*sizeof(double));
//GPU分配内存
cudaMalloc((void**)&d_a,sizeof(double));
cudaMalloc((void**)&d_b,sizeof(double));
cudaMalloc((void**)&d_c,sizeof(double)*Blocks_Per_gird);
//将a,b从CPU复制到GPU
cudaMemcpy(d_a,&a,sizeof(double),cudaMemcpyHostToDevice);
cudaMemcpy(d_b,&b,sizeof(double),cudaMemcpyHostToDevice);
//GPU计算
compute<<<Blocks_Per_gird,Threads_Per_block>>>(d_a, d_b ,d_c);
cudaMemcpy(c,d_c,Blocks_Per_gird*sizeof(double),cudaMemcpyDeviceToHost);
for(int i=0;i<Blocks_Per_gird;i++)
{
pi_g += c[i];
}
pi_g *= (double)(b-a)/N;
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time,start,stop);
printf("CPU+GPU: pi is %.20lf\n", pi_g);
printf(" cost time: %.6f ms\n", time);
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
free(c);
return 0;
}在这里插入代码片
CUDA幂级数法求Π
在这里插入代#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
const int N = 1024*1024*64;
const int Threads_Per_block = 256;
const int Blocks_Per_gird = 64;
const int N_Per_threads = N/(Threads_Per_block*Blocks_Per_gird);
//globla,CPU调用,GPU执行
__global__ void compute(double *a, double *b, double *c){
__shared__ double cache[Threads_Per_block];
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int cacheIndex = threadIdx.x;
double x =0.0;
double temp = 0.0;
for(int i=tid*N_Per_threads+1;i<(tid+1)*N_Per_threads+1;i++)
{ if(i%2 == 0)
x = (-1.0)/(2*i-1); //注意是1.0 浮点数,若写成1会出错!!!
else
x = 1.0/(2*i-1);
temp += 4*x;
}
cache[cacheIndex] = temp;
__syncthreads();
int i = blockDim.x/2;
while(i != 0){
if(cacheIndex < i)
cache[cacheIndex] += cache[cacheIndex + i];
__syncthreads();
i /= 2;
}
if(cacheIndex == 0)
c[blockIdx.x] = cache[0];
}
int main(){
double a,b;
double* c;
double *d_a,*d_b,*d_c;
double pi_g=0.0;
cudaEvent_t start,stop;
float time;
a = 0.0;
b = 1.0;
clock_t begin,end;
float period;
double pi_c=0.0;
double x=0.0;
begin = clock();
for(int i=1;i<=N;i++){
if(i%2 == 0)
x = (-1.0)/(2*i-1);
else
x = 1.0/(2*i-1);
pi_c += x;
}
pi_c *= 4;
end = clock();
period = (float)1000*(end - begin)/CLOCKS_PER_SEC;
printf("CPU: pi is %.20lf\n", pi_c);
printf(" cost time: %.6f ms\n", period);
getchar();
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start,0);
c = (double*)malloc(Blocks_Per_gird*sizeof(double));
//GPU分配内存
cudaMalloc((void**)&d_a,sizeof(double));
cudaMalloc((void**)&d_b,sizeof(double));
cudaMalloc((void**)&d_c,sizeof(double)*Blocks_Per_gird);
//将a,b从CPU复制到GPU
cudaMemcpy(d_a,&a,sizeof(double),cudaMemcpyHostToDevice);
cudaMemcpy(d_b,&b,sizeof(double),cudaMemcpyHostToDevice);
//GPU计算
compute<<<Blocks_Per_gird,Threads_Per_block>>>(d_a, d_b ,d_c);
//将计算结果从GPU复制到CPU
cudaMemcpy(c,d_c,Blocks_Per_gird*sizeof(double),cudaMemcpyDeviceToHost);
//CPU归并
for(int i=0;i<Blocks_Per_gird;i++)
{
pi_g += c[i];
}
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time,start,stop);
printf("CPU+GPU: pi is %.20lf\n", pi_g);
printf(" cost time: %.6f ms\n", time);
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
free(c);
return 0;
}
码片
分析后得
加速比曲线: