矢量求和
#include "error.cuh"
#include <stdio.h>
#include <cuda_runtime.h>
#define N 100
// Element-wise vector sum: c[i] = a[i] + b[i] for i in [0, N).
// Generalized index: the original computed tid from blockIdx.x alone, which
// is only correct for a <<<N,1>>> launch. The flat global index below gives
// the same value when blockDim.x == 1, but also stays correct for any other
// 1D launch configuration.
__global__ void add( int *a, int *b, int *c ) {
    // Flat global thread index across the whole grid.
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < N)                 // guard: grid may overshoot the data size
        c[tid] = a[tid] + b[tid];
}
// Host driver: fill two N-element vectors, add them on the GPU with one
// thread per element, and print the results.
int main( void ) {
    int a[N], b[N], c[N];          // host arrays (N is small enough for the stack)
    int *dev_a, *dev_b, *dev_c;    // device arrays

    // Allocate the memory on the GPU.
    CHECK( cudaMalloc( (void**)&dev_a, N * sizeof(int) ) );
    CHECK( cudaMalloc( (void**)&dev_b, N * sizeof(int) ) );
    CHECK( cudaMalloc( (void**)&dev_c, N * sizeof(int) ) );

    // Fill the arrays 'a' and 'b' on the CPU.
    for (int i=0; i<N; i++) {
        a[i] = -i;
        b[i] = i * i;
    }

    // Copy the arrays 'a' and 'b' to the GPU.
    CHECK( cudaMemcpy( dev_a, a, N * sizeof(int),
                       cudaMemcpyHostToDevice ) );
    CHECK( cudaMemcpy( dev_b, b, N * sizeof(int),
                       cudaMemcpyHostToDevice ) );

    // Launch one thread per element: N blocks of 1 thread each.
    add<<<N,1>>>( dev_a, dev_b, dev_c );
    // A kernel launch returns no status; query the launch error explicitly.
    CHECK( cudaGetLastError() );

    // Copy 'c' back from the GPU to the CPU. cudaMemcpy is blocking, so it
    // also waits for the kernel to finish.
    CHECK( cudaMemcpy( c, dev_c, N * sizeof(int),
                       cudaMemcpyDeviceToHost ) );

    // Display the results.
    for (int i=0; i<N; i++) {
        printf( "%d + %d = %d\n", a[i], b[i], c[i] );
    }

    // Free the memory allocated on the GPU.
    CHECK( cudaFree( dev_a ) );
    CHECK( cudaFree( dev_b ) );
    CHECK( cudaFree( dev_c ) );
    return 0;
}
julia集（Julia set）
（此节代码片缺失：原文的“在这里插入代码片”占位符未被替换为实际代码。）
任意长度矢量求和
#include "error.cuh"
#include <stdio.h>
#include <cuda_runtime.h>
#define N (33 * 1024)
// Element-wise vector sum for vectors longer than the total thread count.
// Launched as a 1D grid of 1D blocks (e.g. <<<128,128>>> = 16384 threads for
// N = 33792 elements); each thread processes elements
//   tid, tid + stride, tid + 2*stride, ...   with stride = total threads.
//
// Note: the original "method 1" looped over all N elements in *every* thread
// and selected the worker with `tid == i % blockDim.x * gridDim.x`, which is
// broken by operator precedence — it parses as (i % blockDim.x) * gridDim.x,
// so only threads whose id is a multiple of gridDim.x ever did any work.
// The grid-stride loop below (the original's commented-out "method 2")
// produces the same output with every thread contributing.
__global__ void add( int *a, int *b, int *c ) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int stride = blockDim.x * gridDim.x;   // total number of threads in the grid
    while (tid < N) {
        c[tid] = a[tid] + b[tid];
        tid += stride;
    }
}
// Host driver: add two N-element vectors (N larger than the thread count)
// on the GPU using a grid-stride kernel, then verify against the CPU.
int main( void ) {
    // Host and device pointers.
    int *a, *b, *c;
    int *dev_a, *dev_b, *dev_c;

    // Allocate host memory dynamically (N * sizeof(int) is ~132 KB, too big
    // to be comfortable on the stack) and check for failure.
    a = (int*)malloc( N * sizeof(int) );
    b = (int*)malloc( N * sizeof(int) );
    c = (int*)malloc( N * sizeof(int) );
    if (a == NULL || b == NULL || c == NULL) {
        fprintf( stderr, "host malloc failed\n" );
        return 1;
    }

    // Allocate device memory.
    CHECK( cudaMalloc( (void**)&dev_a, N * sizeof(int) ) );
    CHECK( cudaMalloc( (void**)&dev_b, N * sizeof(int) ) );
    CHECK( cudaMalloc( (void**)&dev_c, N * sizeof(int) ) );

    // Fill 'a' and 'b' on the CPU.
    for (int i=0; i<N; i++) {
        a[i] = i;
        b[i] = 2 * i;
    }

    // Copy the inputs from host to device.
    CHECK( cudaMemcpy( dev_a, a, N * sizeof(int),
                       cudaMemcpyHostToDevice ) );
    CHECK( cudaMemcpy( dev_b, b, N * sizeof(int),
                       cudaMemcpyHostToDevice ) );

    // Launch 128 blocks x 128 threads = 16384 threads; the kernel's
    // grid-stride loop covers all N = 33792 elements.
    add<<<128,128>>>( dev_a, dev_b, dev_c );
    CHECK( cudaGetLastError() );   // catch launch-configuration errors

    // Copy the result back to the host (blocking, so it also synchronizes).
    CHECK( cudaMemcpy( c, dev_c, N * sizeof(int),
                       cudaMemcpyDeviceToHost ) );

    // Verify every element against the CPU-side expectation.
    bool success = true;
    for (int i=0; i<N; i++) {
        if ((a[i] + b[i]) != c[i]) {
            success = false;
        }
    }
    if (success)
        printf( "We did it!\n" );
    else
        printf("Not Pass!Error!!!\n");

    // Release device memory.
    CHECK( cudaFree( dev_a ) );
    CHECK( cudaFree( dev_b ) );
    CHECK( cudaFree( dev_c ) );
    // Release host memory.
    free( a );
    free( b );
    free( c );
    return 0;
}
矩阵处理
#define N 1000
#define BLOCK_SIZE 32
// Unified-memory (managed) buffers, accessible from both host and device.
// (The original declared output_GPU and output_CPU twice — a redefinition
// compile error; the duplicates are removed here.)
__managed__ int input_Matrix[N][N];
__managed__ int output_GPU[N][N];   // result produced by the GPU kernel
__managed__ int output_CPU[N][N];   // reference result produced on the CPU
// GPU transform over an N x N matrix, one thread per element:
// square the element when both its coordinates are even, otherwise
// subtract one. Expects a 2D grid of 2D blocks covering at least N x N
// threads (x = column, fastest-varying; y = row).
__global__ void kernel(int input_M[N][N], int output_M[N][N])
{
    // Global 2D coordinates of this thread.
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard against the partial blocks at the grid edge.
    if (col >= N || row >= N)
        return;

    int v = input_M[row][col];
    // Both indices even <=> neither has its low bit set.
    output_M[row][col] = (((col | row) & 1) == 0) ? (v * v) : (v - 1);
}
// CPU reference implementation of the same transform: square the elements
// whose row and column indices are both even, subtract one everywhere else.
void cpu_kernel(int intput_M[N][N], int output_CPU[N][N])
{
    for (int row = 0; row < N; ++row)
    {
        for (int col = 0; col < N; ++col)
        {
            int v = intput_M[row][col];
            output_CPU[row][col] =
                (row % 2 == 0 && col % 2 == 0) ? (v * v) : (v - 1);
        }
    }
}
// Host driver: run the matrix transform on CPU and GPU, time both with CUDA
// events, and verify the GPU result against the CPU reference.
int main(int argc, char const *argv[])
{
    // Fill the input matrix with pseudo-random values in [0, 3000].
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j)
        {
            input_Matrix[i][j] = rand()%3001;
        }
    }

    // ---- CPU reference run, timed with CUDA events ----
    // (The original called cudaEventCreate(&start_cpu) twice, leaking the
    // first event; only one create per event here.)
    cudaEvent_t start_cpu,stop_cpu;
    CHECK(cudaEventCreate(&start_cpu));
    CHECK(cudaEventCreate(&stop_cpu));
    CHECK(cudaEventRecord(start_cpu));
    printf("\n***********CPU RUN**************\n");
    cpu_kernel(input_Matrix, output_CPU);
    CHECK(cudaEventRecord(stop_cpu));
    CHECK(cudaEventSynchronize(stop_cpu));
    float elapsed_time_cpu;
    CHECK(cudaEventElapsedTime(&elapsed_time_cpu, start_cpu, stop_cpu));
    printf("Time_CPU = %g ms.\n", elapsed_time_cpu);
    CHECK(cudaEventDestroy(start_cpu));
    CHECK(cudaEventDestroy(stop_cpu));

    // ---- GPU run ----
    cudaEvent_t start,stop_gpu;
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&stop_gpu));
    CHECK(cudaEventRecord(start));
    // Round the grid up so gridDim * BLOCK_SIZE covers all N rows/columns
    // even when N is not a multiple of BLOCK_SIZE.
    unsigned int grid_rows = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
    unsigned int grid_cols = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
    dim3 dimGrid(grid_cols, grid_rows);
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    printf("\n***********GPU RUN**************\n");
    kernel<<<dimGrid, dimBlock>>>(input_Matrix, output_GPU);
    CHECK(cudaGetLastError());        // catch launch-configuration errors
    // Wait for the kernel: the managed output buffer is read on the host below.
    CHECK(cudaDeviceSynchronize());
    CHECK(cudaEventRecord(stop_gpu));
    CHECK(cudaEventSynchronize(stop_gpu));
    float elapsed_time_gpu;
    CHECK(cudaEventElapsedTime(&elapsed_time_gpu, start, stop_gpu));
    printf("Time_GPU = %g ms.\n", elapsed_time_gpu);
    CHECK(cudaEventDestroy(start));
    CHECK(cudaEventDestroy(stop_gpu));

    // ---- Compare the GPU result against the CPU reference ----
    printf("\n***********Check result**************\n");
    int ok=1;
    for (int i = 0; i < N; ++i)
    {
        for (int j = 0; j < N; ++j)
        {
            // Both buffers hold ints, so exact equality is the right test
            // (the original compared fabs(difference) > 1e-10 on ints).
            if (output_GPU[i][j] != output_CPU[i][j])
            {
                ok = 0;
            }
        }
    }
    if(ok)
    {
        printf("Pass!!!\n");
    }
    else
    {
        printf("Error!!!\n");
    }
    // All buffers are static __managed__ arrays — nothing to free.
    return 0;
}
矩阵转置
点积