#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define M 8
#define K 8
#define N 8
void initial(double *array, int size)
{
    // Fill the buffer with pseudo-random integer values in [1, 10],
    // stored as doubles so results compare exactly between CPU and GPU.
    for (int idx = 0; idx < size; idx++)
        array[idx] = (double)(rand() % 10 + 1);
}
void printMatrix(double *array, int row, int col)
{
    // Dump a row-major (row x col) matrix, one matrix row per output line,
    // each value right-aligned in a 10-character field.
    for (int r = 0; r < row; r++)
    {
        const double *rowPtr = array + r * col;
        for (int c = 0; c < col; c++)
            printf("%10lf", rowPtr[c]);
        printf("\n");
    }
}
void multiplicateMatrixOnHost(double *array_A, double *array_B, double *array_C, int M_p, int K_p, int N_p)
{
    // CPU reference: C = A * B with A (M_p x K_p), B (K_p x N_p),
    // C (M_p x N_p), all stored row-major.
    for (int row = 0; row < M_p; row++)
    {
        const double *aRow = array_A + row * K_p;
        double *cRow = array_C + row * N_p;
        for (int col = 0; col < N_p; col++)
        {
            double acc = 0.0;
            for (int k = 0; k < K_p; k++)
                acc += aRow[k] * array_B[k * N_p + col];
            cRow[col] = acc;
        }
    }
}
// Naive GEMM kernel: one thread computes one element of C = A * B.
// Expects a 2D launch where x indexes columns of C and y indexes rows.
__global__ void multiplicateMatrixOnDevice(double *array_A, double *array_B, double *array_C, int M_p, int K_p, int N_p)
{
    int col = blockIdx.x * blockDim.x + threadIdx.x; // column of C
    int row = blockIdx.y * blockDim.y + threadIdx.y; // row of C
    // Guard the ragged edge: the grid may over-cover the matrix.
    if (row < M_p && col < N_p)
    {
        double acc = 0.0;
        for (int k = 0; k < K_p; k++)
            acc += array_A[row * K_p + k] * array_B[k * N_p + col];
        array_C[row * N_p + col] = acc;
    }
}
// Driver: builds random M x K and K x N matrices, multiplies them on the
// CPU and the GPU, prints both results and the CPU/GPU speedup.
int main(int argc, char **argv)
{
    clock_t start = 0, finish = 0;
    double time;
    int Axy = M * K;
    int ABytes = Axy * sizeof(double);
    int Bxy = K * N;
    int BBytes = Bxy * sizeof(double);
    double *h_A, *h_B, *hostRef, *deviceRef;
    h_A = (double*)malloc(ABytes);
    h_B = (double*)malloc(BBytes);
    int nBytes = M * N * sizeof(double);
    hostRef = (double*)malloc(nBytes);
    deviceRef = (double*)malloc(nBytes);
    // Fail fast on host allocation errors instead of dereferencing NULL.
    if (h_A == NULL || h_B == NULL || hostRef == NULL || deviceRef == NULL)
    {
        fprintf(stderr, "host malloc failed\n");
        return 1;
    }
    initial(h_A, Axy);
    printf("\n");
    printf("Matrix_A: (%d×%d)\n", M, K);
    printMatrix(h_A, M, K);
    initial(h_B, Bxy);
    printf("Matrix_B: (%d×%d)\n", K, N);
    printMatrix(h_B, K, N);
    // CPU reference timing (wall clock via clock()).
    start = clock();
    multiplicateMatrixOnHost(h_A, h_B, hostRef, M, K, N);
    finish = clock();
    time = (double)(finish - start) / CLOCKS_PER_SEC;
    printf("\n");
    printf("------------------------------------------------------------------------------------\n");
    printf("Computing matrix product using multiplicateMatrixOnHost \n");
    printf("------------------------------------------------------------------------------------\n");
    printf("Matrix_hostRef: (%d×%d) CPU运行时间为:%lfs\n", M, N, time);
    printMatrix(hostRef, M, N);
    double *d_A, *d_B, *d_C;
    cudaMalloc((void**)&d_A, ABytes);
    cudaMalloc((void**)&d_B, BBytes);
    cudaMalloc((void**)&d_C, nBytes);
    cudaMemcpy(d_A, h_A, ABytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, BBytes, cudaMemcpyHostToDevice);
    printf("\n\n");
    printf("------------------------------------------------------------------------------------\n");
    printf("Computing matrix product using multiplicateMatrixOnDevice \n");
    printf("------------------------------------------------------------------------------------\n");
    int dimx = 16;
    int dimy = 16;
    dim3 block(dimx, dimy);
    // BUG FIX: grid.x must cover the columns (N, matched against ix < N_p in
    // the kernel) and grid.y the rows (M). The original swapped M and N,
    // which only worked because M == N == 8.
    dim3 grid((N + block.x - 1) / block.x, (M + block.y - 1) / block.y);
    cudaEvent_t gpustart, gpustop;
    float elapsedTime = 0.0;
    cudaEventCreate(&gpustart);
    cudaEventCreate(&gpustop);
    cudaEventRecord(gpustart, 0);
    multiplicateMatrixOnDevice << <grid, block >> > (d_A, d_B, d_C, M, K, N);
    // Kernel launches fail silently; surface configuration errors here.
    cudaError_t launchErr = cudaGetLastError();
    if (launchErr != cudaSuccess)
    {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(launchErr));
        return 1;
    }
    cudaDeviceSynchronize();
    cudaEventRecord(gpustop, 0);
    cudaEventSynchronize(gpustop);
    cudaEventElapsedTime(&elapsedTime, gpustart, gpustop);
    cudaEventDestroy(gpustart);
    cudaEventDestroy(gpustop);
    cudaMemcpy(deviceRef, d_C, nBytes, cudaMemcpyDeviceToHost);
    printf("Matrix_deviceRef: (%d×%d) <<<(%d,%d),(%d,%d)>>> GPU运行时间为:%fs\n",
        M, N, grid.x, grid.y, block.x, block.y, elapsedTime/1000);
    printMatrix(deviceRef, M, N);
    // elapsedTime is in milliseconds; convert to seconds for the ratio.
    printf("加速比为: %lf\n\n", time / (elapsedTime / 1000));
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    free(h_A);
    free(h_B);
    free(hostRef);
    free(deviceRef);
    cudaDeviceReset();
    return (0);
}
// Matrix multiplication — CUDA CPU & GPU speedup comparison.
// (Original article published 2023-09-05 16:06:05.)