
#include <iostream>
#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <iomanip>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <ctime>
using namespace std;
// Fill an N x N matrix (row-major, flattened into M) with pseudo-random
// integer values in [0, 99], stored as doubles. Uses the global rand()
// sequence, so results depend on the current srand() seed.
void init_matrix(double *M, int N) {
    const int total = N * N;
    for (int idx = 0; idx < total; ++idx) {
        M[idx] = static_cast<double>(rand() % 100);
    }
}
// CPU serial reference: reads N from stdin, multiplies two random
// N x N matrices, and reports the elapsed CPU time.
int main() {
    int N;
    cudaSetDevice(6); // hardware-specific device index; unused by this CPU-only run
    srand((unsigned) time(NULL));
    if (scanf("%d", &N) != 1 || N <= 0) {
        fprintf(stderr, "invalid matrix size\n");
        return 1;
    }
    size_t mem_size = sizeof(double) * (size_t) N * N;
    double *A = (double *) malloc(mem_size);
    double *B = (double *) malloc(mem_size);
    double *D = (double *) malloc(mem_size); // product A * B
    if (A == NULL || B == NULL || D == NULL) {
        fprintf(stderr, "out of memory\n");
        return 1;
    }
    init_matrix(A, N);
    init_matrix(B, N);
    clock_t start2 = clock();
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            // BUG FIX: the accumulator was `int t`, which truncated every
            // double product (and shadowed the outer time_t t).
            double sum = 0.0;
            for (int k = 0; k < N; k++) {
                sum += A[i * N + k] * B[k * N + j];
            }
            D[i * N + j] = sum;
        }
    }
    clock_t stop2 = clock();
    double time2 = (double) (stop2 - start2) / CLOCKS_PER_SEC;
    // BUG FIX: the original summed |C[i] - D[i]| where C was malloc'd but
    // never written — a read of indeterminate memory. There is no second
    // result to compare against in this CPU-only program, so that bogus
    // "error" report was removed.
    printf("CPU time %lfs\n", time2);
    free(A);
    free(B);
    free(D);
    return 0;
}
1.3.2 GPU程序
#include <iostream>
#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <iomanip>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <ctime>
using namespace std;
// Populate the flattened N x N matrix M with random values drawn from
// {0, 1, ..., 99}, converted to double. Determinism follows the seed
// previously set via srand().
void init_matrix(double *M, int N) {
    for (int row = 0; row < N; ++row) {
        for (int col = 0; col < N; ++col) {
            M[row * N + col] = (double) (rand() % 100);
        }
    }
}
// Kernel: C = A * B for N x N row-major matrices of double.
// One thread computes one output element. Launch with a 1-D grid of 1-D
// blocks providing at least N*N threads; surplus threads exit via the
// bounds guard. Note the B accesses (B[i*N + column]) are coalesced
// across a warp, while each thread walks a full row of A.
__global__
void multimatrix(double *A, double *B, double *C, int N) {
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    int row = idx / N;
    int column = idx % N;
    if (row < N && column < N) {
        // BUG FIX: the accumulator was declared `int`, truncating every
        // double product and corrupting the result matrix.
        double t = 0.0;
        for (int i = 0; i < N; i++) {
            t += A[row * N + i] * B[i * N + column];
        }
        C[row * N + column] = t;
    }
}
// GPU driver: reads N from stdin, multiplies two random N x N matrices
// on the device, verifies the result against a CPU reference, and
// reports the total error and the GPU elapsed time (including the
// host<->device copies and device allocation).
int main() {
    int N;
    cudaSetDevice(6); // hardware-specific device index
    srand((unsigned) time(NULL));
    if (scanf("%d", &N) != 1 || N <= 0) {
        fprintf(stderr, "invalid matrix size\n");
        return 1;
    }
    size_t mem_size = sizeof(double) * (size_t) N * N;
    double *A = (double *) malloc(mem_size);
    double *B = (double *) malloc(mem_size);
    double *C = (double *) malloc(mem_size); // GPU result
    double *D = (double *) malloc(mem_size); // CPU reference
    if (A == NULL || B == NULL || C == NULL || D == NULL) {
        fprintf(stderr, "out of memory\n");
        return 1;
    }
    init_matrix(A, N);
    init_matrix(B, N);
    // ---- GPU computation ----
    clock_t start1 = clock();
    double *CUDA_A, *CUDA_B, *CUDA_C;
    cudaMalloc(&CUDA_A, mem_size);
    cudaMemcpy(CUDA_A, A, mem_size, cudaMemcpyHostToDevice);
    cudaMalloc(&CUDA_B, mem_size);
    cudaMemcpy(CUDA_B, B, mem_size, cudaMemcpyHostToDevice);
    cudaMalloc(&CUDA_C, mem_size);
    int thread_num = 1 << 8;                                // 256 threads per block
    int block_num = (N * N + thread_num - 1) / thread_num;  // ceil-div so every element is covered
    multimatrix<<<block_num, thread_num>>>(CUDA_A, CUDA_B, CUDA_C, N);
    // Kernel launches fail silently; surface launch/config errors here.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    cudaDeviceSynchronize(); // wait for the kernel before reading back
    cudaMemcpy(C, CUDA_C, mem_size, cudaMemcpyDeviceToHost);
    cudaFree(CUDA_A);
    cudaFree(CUDA_B);
    cudaFree(CUDA_C);
    clock_t stop1 = clock();
    double time1 = (double) (stop1 - start1) / CLOCKS_PER_SEC;
    // ---- CPU reference and verification ----
    // BUG FIX: the original printed an undeclared variable `error` (it did
    // not compile) and never used D. Compute the serial reference into D
    // and accumulate the absolute element-wise difference.
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            double sum = 0.0;
            for (int k = 0; k < N; k++) {
                sum += A[i * N + k] * B[k * N + j];
            }
            D[i * N + j] = sum;
        }
    }
    double error = 0.0;
    for (int i = 0; i < N * N; i++) {
        error += fabs(C[i] - D[i]);
    }
    printf("error %lf\n", error);
    printf("GPU time %lfs\n", time1);
    free(A);
    free(B);
    free(C);
    free(D);
    return 0;
}
1.3.3 性能对比与分析
CPU串行和GPU并行在矩阵不同规模下的运行时间如图1所示。

在矩阵规模较小时,CPU串行和GPU并行的程序执行时间无较大差别,但随着问题规模的增大,两者间差距逐渐变大。GPU运行时间随矩阵规模增长较为缓慢,而CPU串行的运行时间则随矩阵规模的增大而迅速增长。



文章介绍了使用CUDA进行GPU并行矩阵乘法与CPU串行计算的性能对比,随着矩阵规模增大,GPU的运行时间增长缓慢,与CPU的差距逐渐拉大。

361

被折叠的 条评论
为什么被折叠?



