#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <cstdlib>
#include <iostream>
#include <stdio.h>

using namespace std;
#define data_size 10000
#define thread_num 256
#define block_num 32
// Per-block partial sum of squares of d_idata, using a grid-stride loop.
//
// Launch configuration expected:
//   kernel<<<block_num, thread_num, thread_num * sizeof(int)>>>(...)
// i.e. the dynamic shared memory must hold one int per thread, and
// d_odata must have room for block_num results (one per block).
__global__ void kernel(int *d_idata, int *d_odata)
{
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    extern __shared__ int shared[];

    // Accumulate this thread's partial sum in a register first.
    // BUG FIX: the original did `shared[tid] += ...` directly, but dynamic
    // shared memory is uninitialized, so the first add folded in garbage.
    int sum = 0;
    for (int i = bid * thread_num + tid; i < data_size; i += thread_num * block_num)
    {
        sum += d_idata[i] * d_idata[i];
    }
    shared[tid] = sum;
    __syncthreads(); // all partials must be visible before the reduction reads them

    // Tree reduction in shared memory: log2(thread_num) steps instead of the
    // original serial loop by thread 0. Requires thread_num to be a power of
    // two (it is: 256). The barrier is outside the divergent branch so every
    // thread in the block reaches it.
    for (int s = thread_num / 2; s > 0; s >>= 1)
    {
        if (tid < s)
        {
            shared[tid] += shared[tid + s];
        }
        __syncthreads();
    }

    if (tid == 0)
    {
        d_odata[bid] = shared[0];
    }
}
// Abort with a diagnostic if a CUDA API call failed. Kernel launches do not
// return errors directly, so launch errors are caught via cudaGetLastError().
static void checkCuda(cudaError_t err, const char *what)
{
    if (err != cudaSuccess)
    {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

// Computes the sum of squares of data_size random ints on the GPU (one
// partial sum per block, final accumulation on the host) and checks it
// against a CPU reference.
int main()
{
    // Host input: data_size values in [0, 9]. rand() is unseeded, so the
    // sequence is deterministic across runs.
    int h_idata[data_size];
    for (int i = 0; i < data_size; i++)
    {
        h_idata[i] = rand() % 10;
    }

    int *d_idata = nullptr;
    int *d_odata = nullptr;
    checkCuda(cudaMalloc(&d_idata, sizeof(int) * data_size), "cudaMalloc d_idata");
    checkCuda(cudaMalloc(&d_odata, sizeof(int) * block_num), "cudaMalloc d_odata");
    checkCuda(cudaMemcpy(d_idata, h_idata, sizeof(int) * data_size,
                         cudaMemcpyHostToDevice),
              "cudaMemcpy host->device");

    // One int of dynamic shared memory per thread (required by the kernel).
    kernel<<<block_num, thread_num, thread_num * sizeof(int)>>>(d_idata, d_odata);
    checkCuda(cudaGetLastError(), "kernel launch");

    // BUG FIX: pass gpu_sum (decays to int*), not &gpu_sum (int(*)[block_num]).
    // The blocking cudaMemcpy also synchronizes with the kernel, so any
    // in-kernel fault surfaces here.
    int gpu_sum[block_num];
    checkCuda(cudaMemcpy(gpu_sum, d_odata, sizeof(int) * block_num,
                         cudaMemcpyDeviceToHost),
              "cudaMemcpy device->host");

    checkCuda(cudaFree(d_idata), "cudaFree d_idata");
    checkCuda(cudaFree(d_odata), "cudaFree d_odata");

    // Accumulate the per-block partial sums on the host.
    int final_gpu_sum = 0;
    for (int i = 0; i < block_num; i++)
    {
        final_gpu_sum += gpu_sum[i];
    }
    printf("final_gpu_sum=%d\n", final_gpu_sum);

    // CPU reference: integer arithmetic, so the GPU result must match exactly.
    int cpu_sum = 0;
    for (int i = 0; i < data_size; i++)
    {
        cpu_sum += h_idata[i] * h_idata[i];
    }
    printf("cpu_sum: %d\n", cpu_sum);

    // Keep the console window open (original behavior, preserved).
    cin.get();
}
CUDA sum-of-squares reduction: 32 blocks x 256 threads — improved version 2
Latest recommended article published 2022-12-12 17:05:54