#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdlib.h>
#include <conio.h>
using namespace std;
// Shorthand for `unsigned int`.
#define u32 unsigned int
// Matrix dimensions: input1 is ARRAY_SIZE_X x ARRAY_SIZE_Y and
// input2 is ARRAY_SIZE_Y x ARRAY_SIZE_X, so their product is
// ARRAY_SIZE_X x ARRAY_SIZE_X.
#define ARRAY_SIZE_X 2
#define ARRAY_SIZE_Y 3
// Host-side operands; main() fills them with rand() % 8 and copies them to the device.
u32 cpu_mat_input1[ARRAY_SIZE_X][ARRAY_SIZE_Y];
u32 cpu_mat_input2[ARRAY_SIZE_Y][ARRAY_SIZE_X];
// Host-side destination that main() copies the device result into.
u32 cpu_mat_result[ARRAY_SIZE_X][ARRAY_SIZE_X];
// Forward declaration of the matrix-multiplication kernel defined after main().
// Each thread accumulates one element of gpu_mat_result from a row of
// gpu_mat_input1 and a column of gpu_mat_input2.
//   Width  - row stride used when writing gpu_mat_result
//   Width1 - number of terms in each dot product; also the row stride of gpu_mat_input1
//   Width2 - row stride of gpu_mat_input2
__global__ void func2(
u32 * gpu_mat_input1,
u32 * gpu_mat_input2,
u32 * gpu_mat_result,
u32 Width,
u32 Width1,
u32 Width2);
// Aborts the program with a diagnostic when a CUDA runtime call fails.
// `what` names the operation so the message points at the failing step.
static void check_cuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Multiplies a random ARRAY_SIZE_X x ARRAY_SIZE_Y matrix by a random
// ARRAY_SIZE_Y x ARRAY_SIZE_X matrix on the GPU and prints both operands
// and the ARRAY_SIZE_X x ARRAY_SIZE_X product.
//
// Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
int main(void)
{
    // Fill both operands with values in [0, 7]. The loop order (y outer,
    // x inner, input1 before input2) fixes which rand() value lands in
    // which element, so it is kept exactly as is.
    for (u32 y = 0; y < ARRAY_SIZE_Y; y++)
    {
        for (u32 x = 0; x < ARRAY_SIZE_X; x++)
        {
            cpu_mat_input1[x][y] = rand() % 8;
            cpu_mat_input2[y][x] = rand() % 8;
        }
    }

    // Derive byte counts from the host arrays so they cannot drift from
    // the array declarations.
    const size_t input1_bytes = sizeof(cpu_mat_input1);
    const size_t input2_bytes = sizeof(cpu_mat_input2);
    const size_t result_bytes = sizeof(cpu_mat_result);

    // One thread per result element, all in a single block.
    dim3 threads_rect(ARRAY_SIZE_X, ARRAY_SIZE_X);
    dim3 blocks_rect(1, 1);

    u32 *gpu_mat_input1 = NULL;
    u32 *gpu_mat_input2 = NULL;
    u32 *gpu_mat_result = NULL;
    check_cuda(cudaMalloc((void **)&gpu_mat_input1, input1_bytes), "cudaMalloc(input1)");
    check_cuda(cudaMalloc((void **)&gpu_mat_input2, input2_bytes), "cudaMalloc(input2)");
    check_cuda(cudaMalloc((void **)&gpu_mat_result, result_bytes), "cudaMalloc(result)");

    check_cuda(cudaMemcpy(gpu_mat_input1, cpu_mat_input1, input1_bytes, cudaMemcpyHostToDevice),
               "cudaMemcpy(input1, host to device)");
    check_cuda(cudaMemcpy(gpu_mat_input2, cpu_mat_input2, input2_bytes, cudaMemcpyHostToDevice),
               "cudaMemcpy(input2, host to device)");

    func2<<<blocks_rect, threads_rect>>>(
        gpu_mat_input1,
        gpu_mat_input2,
        gpu_mat_result,
        ARRAY_SIZE_X,
        ARRAY_SIZE_Y,
        ARRAY_SIZE_X);
    // A kernel launch returns nothing; a bad launch configuration is only
    // visible through cudaGetLastError().
    check_cuda(cudaGetLastError(), "func2 launch");

    // This blocking copy also waits for the kernel, so errors raised while
    // the kernel was executing are reported here.
    check_cuda(cudaMemcpy(cpu_mat_result, gpu_mat_result, result_bytes, cudaMemcpyDeviceToHost),
               "cudaMemcpy(result, device to host)");

    check_cuda(cudaFree(gpu_mat_input1), "cudaFree(input1)");
    check_cuda(cudaFree(gpu_mat_input2), "cudaFree(input2)");
    check_cuda(cudaFree(gpu_mat_result), "cudaFree(result)");

    cout << "input1:" << endl;
    for (u32 x = 0; x < ARRAY_SIZE_X; x++)
    {
        for (u32 y = 0; y < ARRAY_SIZE_Y; y++)
        {
            cout << cpu_mat_input1[x][y] << " ";
        }
        cout << endl;
    }

    cout << "input2:" << endl;
    for (u32 y = 0; y < ARRAY_SIZE_Y; y++)
    {
        for (u32 x = 0; x < ARRAY_SIZE_X; x++)
        {
            cout << cpu_mat_input2[y][x] << " ";
        }
        cout << endl;
    }

    cout << "result:" << endl;
    for (u32 x = 0; x < ARRAY_SIZE_X; x++)
    {
        for (u32 y = 0; y < ARRAY_SIZE_X; y++)
        {
            cout << cpu_mat_result[x][y] << " ";
        }
        cout << endl;
    }

    // Keep the console window open until the user presses Enter.
    printf("press any key to continue\n");
    cin.get();
    return 0;
}
// Matrix-multiplication kernel: each thread computes one element of
// gpu_mat_result as the dot product of a row of gpu_mat_input1 and a
// column of gpu_mat_input2. All three matrices are indexed row-major.
//
// Parameters:
//   gpu_mat_input1 - left operand, row stride Width1
//   gpu_mat_input2 - right operand, row stride Width2
//   gpu_mat_result - output, row stride Width
//   Width          - row stride of the result
//   Width1         - number of terms in each dot product
//   Width2         - number of columns in gpu_mat_input2
//
// Launch layout: a 2D grid of 2D blocks; the x index selects the result
// column and the y index selects the result row. Threads whose column is
// outside gpu_mat_input2 do nothing. The kernel is not given the number of
// result rows, so the launch must not supply more threads in y than the
// result has rows. No shared memory is used.
__global__ void func2(
    u32 * gpu_mat_input1,
    u32 * gpu_mat_input2,
    u32 * gpu_mat_result,
    u32 Width,
    u32 Width1,
    u32 Width2)
{
    // Global indices, so the kernel stays correct when launched with more
    // than one block (for a single block these equal threadIdx.x / .y).
    const u32 col = blockIdx.x * blockDim.x + threadIdx.x;
    const u32 row = blockIdx.y * blockDim.y + threadIdx.y;

    // A column past the end of input2 would read the start of the next row.
    if (col >= Width2)
    {
        return;
    }

    u32 Pvalue = 0;
    // Unsigned counter to match Width1 and avoid a signed/unsigned comparison.
    for (u32 k = 0; k < Width1; ++k)
    {
        const u32 a = gpu_mat_input1[row * Width1 + k];
        const u32 b = gpu_mat_input2[k * Width2 + col];
        Pvalue += a * b;
    }
    gpu_mat_result[row * Width + col] = Pvalue;
}
// CUDA, day 9: matrix multiplication
// (Web page residue: "latest recommended article published on 2023-01-13 16:05:38")