#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#define N 10
// Vector dot product: *c = sum over i in [0, N) of a[i] * b[i].
//
// Launch layout: intended for a single 1D block (the file launches <<<1, N>>>).
//   Any blockDim.x >= 1 produces the correct result because the element-wise
//   products are filled with a block-stride loop. If several blocks are
//   launched, each one recomputes and writes the same value to *c.
// Shared memory: N ints, statically allocated.
// Parameters:
//   a, b - device pointers, each to at least N ints (inputs).
//   c    - device pointer to one int that receives the dot product.
// Side effect: thread 0 prints the result with device-side printf.
__global__ void Dot(int *a, int *b, int *c)
{
    __shared__ int temp[N];

    // Block-stride fill: never indexes past temp[N - 1] when the block has
    // more than N threads, and still covers every element when it has fewer.
    for (int i = threadIdx.x; i < N; i += blockDim.x)
    {
        temp[i] = a[i] * b[i];
    }

    // Every thread of the block reaches this barrier (it is outside any
    // divergent branch), so all products are visible before the reduction.
    __syncthreads();

    if (0 == threadIdx.x)
    {
        int sum = 0;
        // The index must start at 0; the original left it uninitialized.
        for (int i = 0; i < N; i++)
        {
            sum += temp[i];
        }
        *c = sum;
        printf("sum Calculated on Device: %d\n", sum);
    }
}
// Fill the first n slots of a with pseudo-random integers in [0, 9].
// Values come from rand(), one call per element in ascending index order,
// so the output is determined by the current srand() seed.
// Does nothing when n <= 0.
void random_ints(int *a, int n)
{
    int *cursor = a;
    int remaining = n;
    while (remaining > 0)
    {
        *cursor = rand() % 10;
        ++cursor;
        --remaining;
    }
}
// Abort with a diagnostic if a CUDA runtime call did not succeed.
// `what` names the operation so the message points at the failing step.
static void checkCuda(cudaError_t err, const char *what)
{
    if (err != cudaSuccess)
    {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

// Demo driver: builds two random N-element vectors, computes their dot
// product on the device with Dot and on the host, and prints both results.
// Returns 0 on success; exits with EXIT_FAILURE on any allocation or CUDA
// runtime error.
int main()
{
    int *a, *b, *c;       // host buffers
    int *d_a, *d_b, *d_c; // device buffers
    size_t size = N * sizeof(int);

    // Allocate device memory.
    checkCuda(cudaMalloc((void **)&d_a, size), "cudaMalloc d_a");
    checkCuda(cudaMalloc((void **)&d_b, size), "cudaMalloc d_b");
    checkCuda(cudaMalloc((void **)&d_c, sizeof(int)), "cudaMalloc d_c");

    // Allocate the host buffers and verify them before any of them is written.
    a = (int *)malloc(size);
    b = (int *)malloc(size);
    c = (int *)malloc(sizeof(int));
    if (a == NULL || b == NULL || c == NULL)
    {
        fprintf(stderr, "Host memory allocation failed\n");
        exit(EXIT_FAILURE);
    }

    // Initialize the input vectors (a first, then b, to keep the rand() order).
    random_ints(a, N);
    random_ints(b, N);

    // Print the input vectors.
    printf("Array a[N]:\n");
    for (int i = 0; i < N; i++) printf("%d ", a[i]);
    printf("\n");
    printf("Array b[n]:\n");
    for (int i = 0; i < N; i++) printf("%d ", b[i]);
    printf("\n");

    // Copy the inputs from host to device.
    checkCuda(cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice), "copy a to device");
    checkCuda(cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice), "copy b to device");

    // Run the kernel: one block of N threads.
    Dot<<<1, N>>>(d_a, d_b, d_c);
    // A launch returns no status itself; configuration errors show up here.
    checkCuda(cudaGetLastError(), "Dot kernel launch");
    // Faults raised while the kernel runs are reported at the next sync.
    checkCuda(cudaDeviceSynchronize(), "Dot kernel execution");

    // Copy the result from device to host.
    checkCuda(cudaMemcpy(c, d_c, sizeof(int), cudaMemcpyDeviceToHost), "copy result to host");

    // Compute the reference dot product on the host.
    int sumHost = 0;
    for (int i = 0; i < N; i++)
    {
        sumHost += a[i] * b[i];
    }

    // Print both results.
    printf("sum Calculated on Host=%d\n", sumHost);
    printf("Device to Host: a*b=%d\n", *c);

    // Release host memory.
    free(a);
    free(b);
    free(c);

    // Release device memory.
    checkCuda(cudaFree(d_a), "cudaFree d_a");
    checkCuda(cudaFree(d_b), "cudaFree d_b");
    checkCuda(cudaFree(d_c), "cudaFree d_c");
    return 0;
}