#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdlib.h>
#include <conio.h>
using namespace std;
// Kernel launched from main(); it receives three device int arrays.
__global__ void func1(int* data1, int* data2, int* data3);
// Device-side helper; only declared here, its definition further down has an empty body.
__device__ void func2(int *data2, int i);
// Number of int elements in each host array and in each device allocation.
#define ARRAY_SIZE 10
#define ARRAY_SIZE_IN_BYTES (sizeof(int)*(ARRAY_SIZE)) // size in bytes of one int array holding ARRAY_SIZE (10) elements
// Host-side destination buffers: main() copies the three device arrays into
// these and then prints them element by element.
int cpu_data1[ARRAY_SIZE];
int cpu_data2[ARRAY_SIZE];
int cpu_data3[ARRAY_SIZE];
// Terminates the program with a readable message when a CUDA runtime call
// reports anything other than cudaSuccess.
//   status - return value of the CUDA call (or of cudaGetLastError()).
//   what   - short description of the step, included in the message.
static void checkCuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess)
    {
        std::cerr << "CUDA error during " << what << ": "
                  << cudaGetErrorString(status) << std::endl;
        exit(EXIT_FAILURE);
    }
}

// Allocates three device arrays of ARRAY_SIZE ints, runs func1 over them with
// one thread per element, copies the results into cpu_data1..3 and prints one
// "data1 data2 data3" triple per line. Waits for a key press before exiting.
// Returns 0 on success; exits with EXIT_FAILURE if any CUDA step fails.
int main()
{
    const int num_blocks = 1;
    // One thread per array element. func1 has no bounds check, so the launch
    // must never create more threads than ARRAY_SIZE.
    const int num_threads = ARRAY_SIZE / num_blocks;

    int* gpu_data1 = NULL;
    int* gpu_data2 = NULL;
    int* gpu_data3 = NULL;
    checkCuda(cudaMalloc((void **)&gpu_data1, ARRAY_SIZE_IN_BYTES), "cudaMalloc(gpu_data1)");
    checkCuda(cudaMalloc((void **)&gpu_data2, ARRAY_SIZE_IN_BYTES), "cudaMalloc(gpu_data2)");
    checkCuda(cudaMalloc((void **)&gpu_data3, ARRAY_SIZE_IN_BYTES), "cudaMalloc(gpu_data3)");

    // cudaMalloc does not clear memory. func1 writes only odd indices and
    // updates data3 with a read-modify-write, so without this the printed
    // values would be whatever happened to be in the allocation.
    checkCuda(cudaMemset(gpu_data1, 0, ARRAY_SIZE_IN_BYTES), "cudaMemset(gpu_data1)");
    checkCuda(cudaMemset(gpu_data2, 0, ARRAY_SIZE_IN_BYTES), "cudaMemset(gpu_data2)");
    checkCuda(cudaMemset(gpu_data3, 0, ARRAY_SIZE_IN_BYTES), "cudaMemset(gpu_data3)");

    func1<<<num_blocks, num_threads>>>(gpu_data1, gpu_data2, gpu_data3);
    // A launch returns no status itself: configuration errors are reported by
    // cudaGetLastError(), faults during execution by the next synchronising call.
    checkCuda(cudaGetLastError(), "func1 launch");
    checkCuda(cudaDeviceSynchronize(), "func1 execution");

    checkCuda(cudaMemcpy(cpu_data1, gpu_data1, ARRAY_SIZE_IN_BYTES, cudaMemcpyDeviceToHost), "cudaMemcpy(cpu_data1)");
    checkCuda(cudaMemcpy(cpu_data2, gpu_data2, ARRAY_SIZE_IN_BYTES, cudaMemcpyDeviceToHost), "cudaMemcpy(cpu_data2)");
    checkCuda(cudaMemcpy(cpu_data3, gpu_data3, ARRAY_SIZE_IN_BYTES, cudaMemcpyDeviceToHost), "cudaMemcpy(cpu_data3)");

    checkCuda(cudaFree(gpu_data1), "cudaFree(gpu_data1)");
    checkCuda(cudaFree(gpu_data2), "cudaFree(gpu_data2)");
    checkCuda(cudaFree(gpu_data3), "cudaFree(gpu_data3)");

    for (int i = 0; i < ARRAY_SIZE; i++)
    {
        cout << cpu_data1[i] << " " << cpu_data2[i] << " " << cpu_data3[i] << endl;
    }

    // Keep the console window open until the user presses Enter.
    cin.get();
    return 0;
}
// Device-side state used by the func1 kernel.

// Counter in device global memory, starting at 10; func1 reads and updates it.
__device__ int A = 10;

// Value in constant memory; func1 ANDs it into its data3 elements.
static __constant__ int C = 10;

// Per-block shared-memory accumulator. It has no initializer, so it holds no
// defined value until device code stores to it.
__shared__ unsigned int B;
// Kernel: every odd-indexed thread takes a distinct value from the counters
// A and B, stores them in data1 / data2, and masks its data3 element with C.
//
// Launch layout: 1-D grid of 1-D blocks; the element index is
// blockIdx.x * blockDim.x + threadIdx.x. The kernel has no length parameter
// and no bounds check, so the launch must not create more threads than the
// arrays have elements.
//
//   data1 - device array; odd element i receives A as it is right after this
//           thread's increment. A is never reset here, so values continue
//           from wherever earlier launches left it.
//   data2 - device array; odd element i receives B right after this thread's
//           +10. B restarts from 0 in every block.
//   data3 - device array; odd element i is ANDed in place with C, so the
//           caller must have initialised it.
// Even-indexed elements of all three arrays are left untouched. Which odd
// thread obtains which counter value is not ordered.
__global__ void func1(int* data1, int* data2, int* data3)
{
    int i = (blockIdx.x*blockDim.x) + threadIdx.x;

    // Shared memory starts with no defined value: one thread zeroes B and the
    // barrier makes that store visible to the rest of the block. The barrier
    // sits outside the odd-index branch so every thread of the block reaches it.
    if (threadIdx.x == 0)
    {
        B = 0;
    }
    __syncthreads();

    if (i % 2)
    {
        // atomicAdd returns the value before the addition; adding the
        // increment again yields the value this thread produced. A plain
        // "A = A + 1" from several threads is a data race.
        data1[i] = atomicAdd(&A, 1) + 1;
        data2[i] = atomicAdd(&B, 10u) + 10u;
        data3[i] &= C;
    }
}
// Placeholder device function: it performs no work and nothing in this file
// calls it.
//   data2 - unused.
//   i     - unused.
__device__ void func2(int *data2, int i)
{
    // Intentionally empty; the casts only mark the parameters as unused.
    (void)data2;
    (void)i;
}
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdlib.h>
#include <conio.h>
using namespace std;
// NOTE: a second, verbatim copy of the prototypes, size macros, host arrays,
// main(), the device variables A / C / B, func1 and func2 used to follow here.
// Defining those entities twice in one translation unit is a redefinition
// error, so the copy was removed; the single set of definitions earlier in
// this file is the one that is compiled.