#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "device_functions.h"
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#define data_size 1026
#define thread_num 256
using namespace std;
/*
 * Element-wise square: d_odata[i] = d_idata[i] * d_idata[i] for every
 * i in [0, data_size).
 *
 * Note: despite the name, this kernel performs no reduction; it only writes
 * the square of each input element to the matching output slot.
 *
 * Parameters:
 *   d_idata  input array, read at indices [0, data_size)
 *   d_odata  output array, written at indices [0, data_size)
 *
 * Launch layout: any 1D grid of 1D blocks. Each thread starts at its global
 * index and advances by the total number of launched threads (grid-stride
 * loop), so all data_size elements are covered no matter how many blocks or
 * threads are launched, and no thread ever indexes past data_size.
 * No shared memory is used.
 */
__global__ static void sumOfSquares(int *d_idata,int *d_odata)
{
    // Total number of threads in the launch; used as the loop stride.
    const int stride = gridDim.x * blockDim.x;

    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < data_size; i += stride)
    {
        // Debug trace of the element index being processed (device printf is
        // serialized and slow; kept to preserve the original output).
        printf("%d ", i);
        d_odata[i] = d_idata[i] * d_idata[i];
    }
    // No trailing __syncthreads(): threads exchange no data here, and a
    // barrier placed after a bounds-dependent loop would add nothing.
}
int main()
{
int h_idata [data_size];
for (int i = 0; i < data_size; i ++)
{
h_idata[i] = i;//rand()%10;
}
int * d_idata;
int * d_odata;
cudaMalloc((void **)&d_idata,sizeof(int)*data_size);
cudaMalloc((void **)&d_odata,sizeof(int)*data_size);
cudaMemcpy(d_idata,h_idata,sizeof(int)*data_size,cudaMemcpyHostToDevice);
sumOfSquares<<<1,data_size,0>>>(d_idata,d_odata);
int gpu_sum[data_size];
cudaMemcpy(&gpu_sum,d_odata,sizeof(int)*data_size,
cuda 简单数组运算
最新推荐文章于 2022-07-09 18:19:17 发布