异构计算实验——CUDA编程求Π

本文运用两种方法基于CUDA编程求Π,并将CUDA并行和纯CPU串行的运行时间作输出对比。## 标题

CUDA积分法求Π

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <stdlib.h>   
#include <time.h> 
#include <math.h>

const int N = 1024*1024*64;     	//积分时划分的份数
const int Threads_Per_block = 256;	//block中的线程数
const int Blocks_Per_gird = 64;

__global__ void compute(double *a, double *b, double *c){
    __shared__ double cache[Threads_Per_block];

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int cacheIndex = threadIdx.x;
    double x =0.0;
    double temp = 0.0;
    while(tid < N){
        x = *a + (double)(*b-*a)/N*(tid + 0.5);
        temp += 4/(1+x*x);
        tid += blockDim.x * gridDim.x;
    }

    cache[cacheIndex] = temp;
    __syncthreads();

    int i = blockDim.x/2;
    while(i != 0){
        if(cacheIndex < i)
            cache[cacheIndex] += cache[cacheIndex + i];
        __syncthreads();
        i /= 2;
    }

    if(cacheIndex == 0)
        c[blockIdx.x] = cache[0];
}


int main(){
    double a,b;
    double* c;
    double *d_a,*d_b,*d_c;
    double pi_g=0.0;
    cudaEvent_t start,stop;
    float time;
    a = 0.0;
    b = 1.0;

    clock_t begin,end;
    float period;
    double pi_c=0.0;
    double x;
    
    begin = clock();
    for(int i=0;i<N;i++){
		x = a + (double)(b-a)/N*(i+0.5);
        pi_c += 4/(1+x*x);
    }
	pi_c *= (double)(b-a)/N;
    end = clock();
    period = (float)1000*(end - begin)/CLOCKS_PER_SEC;

    printf("CPU: pi is %.20lf\n", pi_c);
    printf("     cost time: %.6f ms\n", period);
    
    getchar();

    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start,0);

    c = (double*)malloc(Blocks_Per_gird*sizeof(double));
    //GPU分配内存
    cudaMalloc((void**)&d_a,sizeof(double));
    cudaMalloc((void**)&d_b,sizeof(double)); 
    cudaMalloc((void**)&d_c,sizeof(double)*Blocks_Per_gird);   
    
    //将a,b从CPU复制到GPU
    cudaMemcpy(d_a,&a,sizeof(double),cudaMemcpyHostToDevice);
    cudaMemcpy(d_b,&b,sizeof(double),cudaMemcpyHostToDevice);

    //GPU计算
    compute<<<Blocks_Per_gird,Threads_Per_block>>>(d_a, d_b ,d_c);

    cudaMemcpy(c,d_c,Blocks_Per_gird*sizeof(double),cudaMemcpyDeviceToHost);

    for(int i=0;i<Blocks_Per_gird;i++)
    {
        pi_g += c[i];
    }
    pi_g *= (double)(b-a)/N;

    cudaEventRecord(stop,0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time,start,stop);

    printf("CPU+GPU: pi is %.20lf\n", pi_g);
    printf("         cost time: %.6f ms\n", time);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    free(c);
    return 0;
}在这里插入代码片

CUDA幂级数法求Π

在这里插入代#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <stdlib.h>   
#include <time.h> 
#include <math.h>

const int N = 1024*1024*64;     	
const int Threads_Per_block = 256;	
const int Blocks_Per_gird = 64;
const int N_Per_threads = N/(Threads_Per_block*Blocks_Per_gird);

//globla,CPU调用,GPU执行
__global__ void compute(double *a, double *b, double *c){
    __shared__ double cache[Threads_Per_block];

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int cacheIndex = threadIdx.x;
    double x =0.0;
    double temp = 0.0;
    
    for(int i=tid*N_Per_threads+1;i<(tid+1)*N_Per_threads+1;i++)
	{	if(i%2 == 0)
            x = (-1.0)/(2*i-1); //注意是1.0 浮点数,若写成1会出错!!!
        else
            x = 1.0/(2*i-1);
        
        temp += 4*x;
   }
    
    cache[cacheIndex] = temp;
    __syncthreads();

    int i = blockDim.x/2;
    while(i != 0){
        if(cacheIndex < i)
            cache[cacheIndex] += cache[cacheIndex + i];
        __syncthreads();
        i /= 2;
    }

    if(cacheIndex == 0)
        c[blockIdx.x] = cache[0];
}


int main(){
    double a,b;
    double* c;
    double *d_a,*d_b,*d_c;
    double pi_g=0.0;
    cudaEvent_t start,stop;
    float time;
    a = 0.0;
    b = 1.0;

    clock_t begin,end;
    float period;
    double pi_c=0.0;
    double x=0.0;
    
    begin = clock();
    for(int i=1;i<=N;i++){
        if(i%2 == 0)
            x = (-1.0)/(2*i-1);
        else
            x = 1.0/(2*i-1);
        
        pi_c += x;
    }
	pi_c *= 4;
    end = clock();
    period = (float)1000*(end - begin)/CLOCKS_PER_SEC;

    printf("CPU: pi is %.20lf\n", pi_c);
    printf("     cost time: %.6f ms\n", period);
    
    getchar();

    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start,0);

    c = (double*)malloc(Blocks_Per_gird*sizeof(double));

    //GPU分配内存
    cudaMalloc((void**)&d_a,sizeof(double));
    cudaMalloc((void**)&d_b,sizeof(double)); 
    cudaMalloc((void**)&d_c,sizeof(double)*Blocks_Per_gird);   

    //将a,b从CPU复制到GPU
    cudaMemcpy(d_a,&a,sizeof(double),cudaMemcpyHostToDevice);
    cudaMemcpy(d_b,&b,sizeof(double),cudaMemcpyHostToDevice);
    
    //GPU计算
    compute<<<Blocks_Per_gird,Threads_Per_block>>>(d_a, d_b ,d_c);

    //将计算结果从GPU复制到CPU
    cudaMemcpy(c,d_c,Blocks_Per_gird*sizeof(double),cudaMemcpyDeviceToHost);

    //CPU归并
    for(int i=0;i<Blocks_Per_gird;i++)
    {
        pi_g += c[i];
    }

    cudaEventRecord(stop,0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time,start,stop);

    printf("CPU+GPU: pi is %.20lf\n", pi_g);
    printf("         cost time: %.6f ms\n", time);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    free(c);
    return 0;
}

码片

分析后得
加速比曲线:
在这里插入图片描述

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值