#include <cmath>
#include <cstdio>
#include <iostream>
#include <time.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cuda.h>
#include "device_launch_parameters.h"
int array_n = 500;
using namespace std;
void Get_exmaples(double** &a, double** &b, double ** &c);
void Print_GPU_Status();
double** Array_HostToDevice(double **A);
double** Array_DeviceToHost(double **gpu_A);
bool GPU_C_is_GPU_C(double **gpu_C, double** A, double **B);
__global__ void Array_MUl(double** A, double** B, double **C, int array_n)
{
int row = blockDim.x * blockIdx.x + threadIdx.x;
int col = blockDim.y * blockIdx.y + threadIdx.y;
if(row < array_n && col < array_n)
{
double value = 0;
for(int i = 0; i < array_n; i++)
value += A[row][i] * B[i][col];
C[row][col] = value;
}
}
void Main_Product()
{
double** A, **B, **C;
Get_exmaples(A, B, C);
double** gpu_A, **gpu_B, **gpu_C;
gpu_A = Array_HostToDevice(A);
gpu_B = Array_HostToDevice(B);
gpu_C = Array_HostToDevice(B);//随便设置一个值
dim3 blocksize(10, 10);
dim3 grimsize(array_n / blocksize.x + 1 , array_n / blocksize.y + 1);
cudaEvent_t startTime, endTime;
cudaEventCreate(&startTime);
cudaEventCreate(&endTime);
cudaEventRecord(startTime, 0);
cudaDeviceSynchronize();
Array_MUl<<<grimsize, blocksize>>>(gpu_A, gpu_B, gpu_C, array_n);
cudaEventRecord(endTime, 0);
cudaEventSynchronize(startTime);
cudaEventSynchronize(endTime);
float Time;
cudaEventElapsedTime(&Time, startTime, endTime);
printf(" time (GPU) : %f ms \n", Time);
cudaEventDestroy(startTime);
cudaEventDestroy(endTime);
clock_t Start_time, End_time;
double **mid_data;
mid_data = new double*[array_n];
cudaMemcpy(mid_data, gpu_C, sizeof(double*) * array_n, cudaMemcpyDeviceToHost);
for(int i = 0; i < array_n; i++)
cudaMemcpy(C[i], mid_data[i], sizeof(double) * array_n, cudaMemcpyDeviceToHost);
Start_time = clock();
cout << GPU_C_is_GPU_C(C, A, B) << endl;
End_time = clock();
printf(" time (CPU) : %f ms \n", float(End_time - Start_time));
cudaFree(gpu_C);
cudaFree(gpu_A);
cudaFree(gpu_B);
}
int main()
{
int n[] = {500,1000, 2000};
for(int i = 0; i < sizeof(n) / sizeof(int); i++)
{
array_n = n[i];
cout << "________________ " << n[i] << " ______________" << endl;
Main_Product();
}
}
double** Array_HostToDevice(double **A)
{i
double **gpu_A;
double **mid_data;
mid_data = new double*[array_n];
cudaMalloc((void***)&gpu_A, array_n * sizeof(double*));
for(int i = 0; i < array_n; i++)
{
cudaMalloc((void**)&(mid_data[i]), sizeof(double) * array_n);
cudaMemcpy(mid_data[i], A[i], sizeof (double) * array_n, cudaMemcpyHostToDevice);
}
cudaMemcpy(gpu_A, mid_data, array_n * sizeof(double*), cudaMemcpyHostToDevice);
return gpu_A;
}
bool GPU_C_is_GPU_C(double **gpu_C, double** A, double **B)
{
double **cpu_C;
cpu_C = new double*[array_n];
for(int i = 0; i < array_n; i++)
cpu_C[i] = new double[array_n];
for(int i = 0; i < array_n; i++)
for(int j = 0; j < array_n; j++)
{
double value = 0;
for(int k = 0; k < array_n; k++)
value += A[i][k] * B[k][j];
cpu_C[i][j] = value;
}
for(int i = 0; i < array_n; i++)
for(int j = 0; j < array_n; j++)
if(cpu_C[i][j] != gpu_C[i][j])
return false;
return true;
}
void Get_exmaples(double** &a, double** &b, double ** &c)
{
a = new double* [array_n];
b = new double* [array_n];
c = new double* [array_n];
for (int i = 0; i < array_n; i++)
{
a[i] = new double[array_n];
b[i] = new double[array_n];
c[i] = new double[array_n];
for (int j = 0; j < array_n; j++)
{
a[i][j] = i * array_n + j;
b[i][j] = i * array_n + j + 1;
}
}
}
void Print_Array(double** x)
{
for (int i = 0; i < array_n; i++)
{
for (int j = 0; j < array_n; j++)
{
cout << " " << x[i][j];
}
cout << endl;
}
}
//void Print_GPU_Status()
//{
// int deviceCount;
// cudaGetDeviceCount(&deviceCount);
// for (int dev = 0; dev < deviceCount; dev++)
// {
// int driver_version(0), runtime_version(0);
// cudaDeviceProp deviceProp;
// cudaGetDeviceProperties(&deviceProp, dev);
// if (dev == 0)
// if (deviceProp.minor == 9999 && deviceProp.major == 9999)
// printf("\n");
// printf("\nDevice%d:\"%s\"\n", dev, deviceProp.name);
// cudaDriverGetVersion(&driver_version);
// printf("CUDA驱动版本: %d.%d\n", driver_version / 1000, (driver_version % 1000) / 10);
// cudaRuntimeGetVersion(&runtime_version);
// printf("CUDA运行时版本: %d.%d\n", runtime_version / 1000, (runtime_version % 1000) / 10);
// printf("设备计算能力: %d.%d\n", deviceProp.major, deviceProp.minor);
// printf("Total amount of Global Memory: %u bytes\n", deviceProp.totalGlobalMem);
// printf("Number of SMs: %d\n", deviceProp.multiProcessorCount);
// printf("Total amount of Constant Memory: %u bytes\n", deviceProp.totalConstMem);
// printf("Total amount of Shared Memory per block: %u bytes\n", deviceProp.sharedMemPerBlock);
// printf("Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
// printf("Warp size: %d\n", deviceProp.warpSize);
// printf("Maximum number of threads per SM: %d\n", deviceProp.maxThreadsPerMultiProcessor);
// printf("Maximum number of threads per block: %d\n", deviceProp.maxThreadsPerBlock);
// printf("Maximum size of each dimension of a block: %d x %d x %d\n", deviceProp.maxThreadsDim[0],
// deviceProp.maxThreadsDim[1],
// deviceProp.maxThreadsDim[2]);
// printf("Maximum size of each dimension of a grid: %d x %d x %d\n", deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], deviceProp.maxGridSize[2]);
// printf("Maximum memory pitch: %u bytes\n", deviceProp.memPitch);
// printf("Texture alignmemt: %u bytes\n", deviceProp.texturePitchAlignment);
// printf("Clock rate: %.2f GHz\n", deviceProp.clockRate * 1e-6f);
// printf("Memory Clock rate: %.0f MHz\n", deviceProp.memoryClockRate * 1e-3f);
// printf("Memory Bus Width: %d-bit\n", deviceProp.memoryBusWidth);
// }
//}
// 并行计算-矩阵相乘 (Parallel computing - matrix multiplication)
// First published 2021-04-09 17:07:28