#include "cuda_runtime_api.h"
#include "device_launch_parameters.h"
#include <cstdlib>
#include <iostream>
using namespace std;
#define numElements 256
// Timing helper for Windows.
#include <windows.h>
/*
 * Returns the current value of the Windows performance counter divided by
 * the counter frequency, i.e. a timestamp in seconds as a double.
 * The frequency is queried on the first call only and cached in a static.
 */
double get_time() {
    static LARGE_INTEGER ticksPerSecond;
    static int frequencyCached = 0;

    // One-time lookup of the counter frequency.
    if (!frequencyCached) {
        QueryPerformanceFrequency(&ticksPerSecond);
        frequencyCached = 1;
    }

    LARGE_INTEGER now;
    QueryPerformanceCounter(&now);

    // Convert the tick count to double before dividing to keep the fraction.
    return static_cast<double>(now.QuadPart) / ticksPerSecond.QuadPart;
}
/*
 * CPU reference matrix multiply for numElements x numElements int matrices.
 * Adds the product A * B onto whatever C already holds (C is NOT cleared
 * here), so callers wanting a plain product must pass a zeroed C.
 */
void Host_MarticMultiply(int A[][numElements], int B[][numElements], int C[][numElements]) {
    for (int row = 0; row != numElements; ++row) {
        const int* lhsRow = A[row];
        int* outRow = C[row];
        for (int col = 0; col != numElements; ++col) {
            int inner = 0;
            while (inner < numElements) {
                // Same per-term accumulation into C as a classic i-j-k loop.
                outRow[col] += lhsRow[inner] * B[inner][col];
                ++inner;
            }
        }
    }
}
/*
 * GPU matrix multiply: h_c += h_a * h_b for numElements x numElements int
 * matrices, one thread per output element.
 *
 * Parameters (all must be DEVICE pointers despite the h_ prefix):
 *   h_a, h_b - input matrices, only read.
 *   h_c      - output matrix; the product is added to its existing contents,
 *              so it must be zeroed beforehand for a plain product.
 *              It must not alias h_a or h_b.
 *
 * Launch layout: a 2D grid whose total thread count is at least numElements
 * in both x and y. Threads outside the matrix exit through the bounds guard.
 * No shared memory is used.
 */
__global__ void MatixMultiply(int h_a[][numElements], int h_b[][numElements], int h_c[][numElements])
{
    // x indexes the column and y the row, so threads that are adjacent in x
    // (and therefore in the same warp) touch adjacent addresses of h_b and h_c.
    int col = threadIdx.x + blockIdx.x * blockDim.x;
    int row = threadIdx.y + blockIdx.y * blockDim.y;
    if (row < numElements && col < numElements) {
        // Accumulate in a register and write global memory once, instead of a
        // global read-modify-write on every iteration of the k loop.
        int acc = 0;
        for (int k = 0; k < numElements; k++) {
            acc += h_a[row][k] * h_b[k][col];
        }
        h_c[row][col] += acc;
    }
}
/*
 * Prints a header line followed by three 5x5 samples of the matrix:
 * columns 0-4 of rows 0-4, rows 100-104 and rows 250-254.
 * Values are tab-terminated, one matrix row per output line, with a blank
 * line between samples (none after the last one).
 */
void textValue(int h_c[][numElements]) {
    std::cout << "测试程序:测试三组数据" << std::endl;

    const int sampleStartRows[3] = { 0, 100, 250 };
    const int sampleSize = 5;

    for (int s = 0; s < 3; ++s) {
        // Blank separator line before every sample except the first.
        if (s != 0) {
            std::cout << std::endl;
        }
        const int firstRow = sampleStartRows[s];
        for (int r = firstRow; r < firstRow + sampleSize; ++r) {
            for (int c = 0; c < sampleSize; ++c) {
                std::cout << h_c[r][c] << "\t";
            }
            std::cout << std::endl;
        }
    }
}
// Reports a failed CUDA runtime call on stderr and terminates the program.
// `what` names the call so the failure can be located.
static void checkCuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        std::cerr << "CUDA error in " << what << ": "
                  << cudaGetErrorString(status) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

/*
 * Benchmark driver: multiplies two numElements x numElements int matrices on
 * the GPU and on the CPU, prints both wall-clock times and a few sample
 * values from each result.
 * Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
 */
int main()
{
    const size_t bytes = sizeof(int) * numElements * numElements;

    // Host matrices: inputs a and b, GPU result c, CPU reference result d.
    int(*h_a)[numElements] = new int[numElements][numElements];
    int(*h_b)[numElements] = new int[numElements][numElements];
    int(*h_c)[numElements] = new int[numElements][numElements];
    int(*h_d)[numElements] = new int[numElements][numElements];
    for (int i = 0; i < numElements; i++) {
        for (int j = 0; j < numElements; j++) {
            h_a[i][j] = i + 1;
            h_b[i][j] = i + 1;
            // Both multiply routines accumulate into their output, so the
            // result matrices have to start at zero.
            h_c[i][j] = 0;
            h_d[i][j] = 0;
        }
    }

    // Device buffers.
    int(*d_a)[numElements] = 0;
    int(*d_b)[numElements] = 0;
    int(*d_c)[numElements] = 0;
    checkCuda(cudaMalloc((void**)&d_a, bytes), "cudaMalloc(d_a)");
    checkCuda(cudaMalloc((void**)&d_b, bytes), "cudaMalloc(d_b)");
    checkCuda(cudaMalloc((void**)&d_c, bytes), "cudaMalloc(d_c)");
    checkCuda(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_a)");
    checkCuda(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_b)");
    checkCuda(cudaMemcpy(d_c, h_c, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_c)");

    // Launch configuration: just enough 16x16 blocks to cover the matrix
    // (ceil-div); the kernel's bounds guard handles any overhang.
    dim3 threads(16, 16);
    dim3 blocks((numElements + threads.x - 1) / threads.x,
                (numElements + threads.y - 1) / threads.y);

    // GPU timing: drain pending work first, then time launch + execution.
    checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize (before kernel)");
    double td = get_time();
    MatixMultiply<<<blocks, threads>>>(d_a, d_b, d_c);
    checkCuda(cudaGetLastError(), "MatixMultiply launch");
    checkCuda(cudaDeviceSynchronize(), "MatixMultiply execution");
    td = get_time() - td;
    cout << "GPU执行时间:" << td << endl; // total GPU execution time

    // Copy the GPU result back to host memory (blocking copy).
    checkCuda(cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(h_c)");

    // CPU timing: no device work is pending here, so no synchronization needed.
    double th = get_time();
    Host_MarticMultiply(h_a, h_b, h_d);
    th = get_time() - th;
    cout << "CPU执行时间:" << th << endl; // total CPU execution time

    cout << "GPU:" << endl;
    textValue(h_c);
    cout << endl;
    cout << "CPU:" << endl;
    textValue(h_d);
    cout << endl;

    // The host arrays came from new[], so they must be released with delete[].
    delete[] h_a;
    delete[] h_b;
    delete[] h_c;
    delete[] h_d;
    checkCuda(cudaFree(d_a), "cudaFree(d_a)");
    checkCuda(cudaFree(d_b), "cudaFree(d_b)");
    checkCuda(cudaFree(d_c), "cudaFree(d_c)");
    return 0;
}
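// CUDA matrix multiplication (title of the blog post this code was copied from)
// Blog page footer: "Latest recommended article published 2024-04-29 13:34:21"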