/* Compute the inner (dot) product of two vectors. The two vectors are given by the arrays a and b. */
#include <stdio.h>
#include <string.h>
// Number of elements in each input vector.
#define N 100
#define M 128// number of threads per block used for the launch; also the size of the kernel's shared scratch array, and must be a power of two for the reduction in dot()
// Host-side buffers: a and b hold the two input vectors, c receives the
// per-block partial sums copied back from the device. Note that the kernel's
// parameters reuse the names a, b and c and shadow these arrays inside dot().
int a[N],b[N],c[N];
/*
 * Dot-product kernel. Every block accumulates the products a[k]*b[k] for the
 * indices its threads visit, reduces them in shared memory, and stores one
 * partial sum in c[blockIdx.x]. The caller adds the partial sums together.
 *
 * Launch requirements:
 *   - one-dimensional grid and block (only the .x components are used);
 *   - blockDim.x must be a power of two, otherwise the halving reduction
 *     skips elements;
 *   - blockDim.x must not exceed M, the size of the shared scratch array;
 *   - c must have room for gridDim.x ints.
 * Only the first N elements of a and b are read.
 */
__global__ void dot(int* a,int* b,int* c){
    __shared__ int partial[M];
    const int local = threadIdx.x;
    int acc = 0;

    // Start at this thread's global index and advance by the total number of
    // launched threads, so any grid size covers all N elements.
    for(int k = threadIdx.x + blockIdx.x*blockDim.x; k < N; k += blockDim.x*gridDim.x)
        acc += a[k]*b[k];

    partial[local] = acc;
    // All per-thread sums must be visible before any thread reads a neighbour's slot.
    __syncthreads();

    // Tree reduction: at each step the lower half of the active slots absorbs
    // the upper half, halving the active range until slot 0 holds the block sum.
    for(int half = blockDim.x/2; half != 0; half /= 2){
        if(local < half)
            partial[local] += partial[local + half];
        // Kept outside the branch so every thread of the block reaches the barrier.
        __syncthreads();
    }

    // One thread per block publishes the block's partial result.
    if(local == 0)
        c[blockIdx.x] = partial[0];
}
/*
 * Print a diagnostic on stderr when a CUDA runtime call did not succeed.
 * Returns 0 for cudaSuccess and 1 otherwise, so callers can branch on it.
 */
static int reportCudaError(cudaError_t status,const char* what){
    if(status == cudaSuccess)
        return 0;
    fprintf(stderr,"CUDA error during %s: %s\n",what,cudaGetErrorString(status));
    return 1;
}

/*
 * Host driver: fills a with ones and b with 0..N-1, runs the dot kernel on the
 * device, adds the per-block partial sums on the host and prints the result.
 * Returns 0 on success and 1 if any CUDA call fails.
 */
int main(){
    int i,block_num,res = 0;
    int exit_code = 1;
    /* Start as NULL so the shared cleanup path can free them unconditionally. */
    int *d_a = NULL,*d_b = NULL,*d_c = NULL;

    for(i = 0;i<N;i++){
        a[i] = 1;
        b[i] = i;
    }

    /* Ceiling division: enough blocks of M threads to cover N elements. */
    block_num = (N + M - 1)/M;

    if(reportCudaError(cudaMalloc((void**)&d_a,N*sizeof(int)),"cudaMalloc(d_a)"))
        goto cleanup;
    if(reportCudaError(cudaMalloc((void**)&d_b,N*sizeof(int)),"cudaMalloc(d_b)"))
        goto cleanup;
    /* The kernel writes exactly one partial sum per block. */
    if(reportCudaError(cudaMalloc((void**)&d_c,block_num*sizeof(int)),"cudaMalloc(d_c)"))
        goto cleanup;

    if(reportCudaError(cudaMemcpy(d_a,a,N*sizeof(int),cudaMemcpyHostToDevice),"copy of a to device"))
        goto cleanup;
    if(reportCudaError(cudaMemcpy(d_b,b,N*sizeof(int),cudaMemcpyHostToDevice),"copy of b to device"))
        goto cleanup;

    dot<<<block_num,M>>>(d_a,d_b,d_c);
    /* A kernel launch returns nothing; bad launch configurations show up here. */
    if(reportCudaError(cudaGetLastError(),"dot kernel launch"))
        goto cleanup;

    /* Copy back only the block_num entries the kernel wrote. This blocking
       copy also waits for the kernel and reports errors raised while it ran. */
    if(reportCudaError(cudaMemcpy(c,d_c,block_num*sizeof(int),cudaMemcpyDeviceToHost),"copy of partial sums to host"))
        goto cleanup;

    for(i = 0;i<block_num;i++)
        res += c[i];
    printf("%d\n",res);
    exit_code = 0;

cleanup:
    /* cudaFree on a NULL pointer is a no-op, so this is safe on every path. */
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    return exit_code;
}
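// CUDA -- vector inner product
// (Web-page footer carried over from the source article: "latest recommended article published 2023-02-02 16:23:06")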