实验要求
编写矩阵乘法代码实现,并编译执行;对代码进行执行时间分析,比较不同实现的效率差异;
实验步骤
1.完成GEMM示例,并修改输入数据大小
首先,建立一个test.cpp文件,利用以下代码(来源:https://github.com/pengzhao-intel/oneAPI_course/blob/main/code/gemm_basic.cpp#L154 )写入,将M,N,K值替换成2000,保存关闭。然后进入文件所在目录的终端,利用本地安装的DPC++编译器进行编译操作:在终端输入clang++ -fsycl test.cpp -o test.exe生成可执行文件(注意-o前为普通连字符),然后执行该文件,命令为./test.exe。稍等一段时间后,就会显示出执行结果数据,相关数据如下:
#include <chrono>
#include <iostream>
#include <CL/sycl.hpp>
#define random_float() (rand() / double(RAND_MAX))
using namespace std;
using namespace sycl;
// return execution time
// Naive GEMM kernel on the device: C = A * B (row-major, A is MxK, B is KxN).
// One work-item computes one element of C. Returns the kernel execution time
// in milliseconds, measured via SYCL event profiling.
double gpu_kernel(float *A, float *B, float *C, int M, int N, int K, int block_size, sycl::queue &q) {
// define the workgroup size and mapping: round the grid up to a multiple of block_size
auto grid_rows = (M + block_size - 1) / block_size * block_size;
auto grid_cols = (N + block_size - 1) / block_size * block_size;
auto local_ndrange = range<2>(block_size, block_size);
auto global_ndrange = range<2>(grid_rows, grid_cols);
double duration = 0.0;
auto e = q.submit([&](sycl::handler &h) {
h.parallel_for<class k_name_t>(
sycl::nd_range<2>(global_ndrange, local_ndrange), [=](sycl::nd_item<2> index) {
int row = index.get_global_id(0);
int col = index.get_global_id(1);
// BUG FIX: because the grid was rounded up, work-items with
// row >= M or col >= N exist whenever M/N is not a multiple of
// block_size; without this guard they read and write out of bounds.
if (row < M && col < N) {
float sum = 0.0f;
for (int i = 0; i < K; i++) {
sum += A[row * K + i] * B[i * N + col];
}
C[row * N + col] = sum;
}
});
});
e.wait();
// profiling timestamps are in nanoseconds; convert to milliseconds
duration += (e.get_profiling_info<info::event_profiling::command_end>() -
e.get_profiling_info<info::event_profiling::command_start>()) / 1000.0 / 1000.0;
return duration;
}
// return execution time
// Single-threaded reference GEMM on the host: cC = cA * cB.
// cA is M x K, cB is K x N, cC is M x N, all row-major.
// Returns the elapsed wall-clock time in milliseconds.
double cpu_kernel(float *cA, float *cB, float *cC, int M, int N, int K) {
    const auto start = std::chrono::high_resolution_clock::now();
    for (int row = 0; row < M; row++) {
        for (int col = 0; col < N; col++) {
            float acc = 0.0f;
            for (int idx = 0; idx < K; idx++) {
                acc += cA[row * K + idx] * cB[idx * N + col];
            }
            cC[row * N + col] = acc;
        }
    }
    const auto stop = std::chrono::high_resolution_clock::now();
    return std::chrono::duration<float, std::milli>(stop - start).count();
}
// Element-wise comparison of the CPU and GPU result buffers: counts the
// entries whose absolute difference exceeds 1e-3 and prints each offending
// pair for inspection. Returns the mismatch count (0 means results agree).
int verify(float *cpu_res, float *gpu_res, int length){
    int mismatch_count = 0;
    for (int pos = 0; pos < length; pos++) {
        const double delta = fabs(cpu_res[pos] - gpu_res[pos]);
        if (delta > 1e-3) {
            mismatch_count++;
            printf("\n%lf, %lf", cpu_res[pos], gpu_res[pos]);
        }
    }
    return mismatch_count;
}
// Run the GEMM benchmark: allocate and initialize the matrices, time the GPU
// and CPU kernels (averaged over several runs after a warm-up), cross-check
// the two results, print the timings, and return the mismatch count.
int gemm(const int M,
const int N,
const int K,
const int block_size,
const int iterations,
sycl::queue &q) {
cout << "Problem size: c(" << M << "," << N << ") ="
<< " a(" << M << "," << K << ") *"
<< " b(" << K << "," << N << ")\n";
// USM allocations: A/B/C are shared between host and device, C_host is host-only
auto A = malloc_shared<float>(M * K, q);
auto B = malloc_shared<float>(K * N, q);
auto C = malloc_shared<float>(M * N, q);
auto C_host = malloc_host<float>(M * N, q);
// init the A/B/C
for(int i=0; i < M * K; i++) {
A[i] = random_float();
}
for(int i=0; i < K * N; i++) {
B[i] = random_float();
}
for(int i=0; i < M * N; i++) {
C[i] = 0.0f;
C_host[i] = 0.0f;
}
double flopsPerMatrixMul
= 2.0 * static_cast<double>(M) * static_cast<double>(N) * static_cast<double>(K);
double duration_gpu = 0.0;
double duration_cpu = 0.0;
// GPU computation and timer: discard the first `warmup` runs (JIT, caches)
int warmup = 10;
for (int run = 0; run < iterations + warmup; run++) {
// keep the duration in double; the original narrowed it to float
double duration = gpu_kernel(A, B, C, M, N, K, block_size, q);
if(run >= warmup) duration_gpu += duration;
}
duration_gpu = duration_gpu / iterations;
// CPU computation and timer: half as many runs because the CPU kernel is slow
warmup = 2;
const int cpu_runs = iterations / 2;
for(int run = 0; run < cpu_runs + warmup; run++) {
double duration = cpu_kernel(A, B, C_host, M, N, K);
if(run >= warmup) duration_cpu += duration;
}
// BUG FIX: the original `duration_cpu / iterations/2` divided the sum by
// 2*iterations, but only iterations/2 runs were accumulated (4x too small).
if (cpu_runs > 0) duration_cpu = duration_cpu / cpu_runs;
// Compare the results of CPU and GPU
int errCode = verify(C_host, C, M*N);
if(errCode > 0) printf("\nThere are %d errors\n", errCode);
printf("\nPerformance Flops = %lf, \n"
"GPU Computation Time = %lf (ms); \n"
"CPU Computation Time = %lf (ms); \n",
flopsPerMatrixMul, duration_gpu, duration_cpu);
free(A, q);
free(B, q);
free(C, q);
free(C_host, q);
return(errCode);
}
int main() {
auto propList = cl::sycl::property_list {cl::sycl::property::queue::enable_profiling()};
queue my_gpu_queue( cl::sycl::gpu_selector_v , propList);
int errCode = gemm(2000, 2000, 2000, 4, 10, my_gpu_queue);
return(errCode);
}
2.测试不同tile_X和tile_Y大小下( tile_X和tile_Y可以不一致),矩阵计算的性能,并进行结果分析。
首先,建立一个gemm_tile.cpp文件,利用以下代码(来源:https://github.com/pengzhao-intel/oneAPI_course/blob/main/code/gemm_tile.cpp#L12)写入,保存关闭。然后进入文件所在目录的终端,利用本地安装的DPC++编译器进行编译操作:在终端输入clang++ -fsycl gemm_tile.cpp -o gemm_tile.exe生成可执行文件,然后执行该文件,命令为./gemm_tile.exe(注意命令中无空格)。稍等一段时间后,就会显示出执行结果数据,相关数据如下(此时tile_X和tile_Y的值都设为2,M,N,K值也都替换成2000):
#include <chrono>
#include <iostream>
#include <CL/sycl.hpp>
#define random_float() (rand() / double(RAND_MAX))
using namespace std;
using namespace sycl;
#define tileY 2
#define tileX 2
// return execution time
// Register-tiled GEMM kernel: each work-item computes a tileY x tileX
// sub-block of C held in registers (A is MxK, B is KxN, C is MxN, row-major).
// Assumes M is divisible by tileY and N by tileX, and the resulting grid by
// BLOCK. Returns the kernel execution time in ms via SYCL event profiling.
double gpu_kernel(float *A, float *B, float *C,
int M, int N, int K,
int BLOCK, sycl::queue &q) {
// define the workgroup size and mapping: one work-item per output tile
auto grid_rows = M / tileY;
auto grid_cols = N / tileX;
auto local_ndrange = range<2>(BLOCK, BLOCK);
auto global_ndrange = range<2>(grid_rows, grid_cols);
double duration = 0.0;
auto e = q.submit([&](sycl::handler &h) {
h.parallel_for<class k_name_t>(
sycl::nd_range<2>(global_ndrange, local_ndrange), [=](sycl::nd_item<2> index) {
int row = tileY * index.get_global_id(0);
int col = tileX * index.get_global_id(1);
float sum[tileY][tileX] = {0.0f};
float subA[tileY] = {0.0f};
float subB[tileX] = {0.0f};
// core computation: accumulate over the shared dimension K.
// BUG FIX: the original looped `k < N` and indexed A as
// A[(row+m)*N + k]; the reduction dimension is K and row-major A
// has row stride K. It only worked because M == N == K here.
for (int k = 0; k < K; k++) {
// read data to register
for(int m = 0; m < tileY; m++) {
subA[m] = A[(row + m) * K + k];
}
for(int p = 0; p < tileX; p++) {
subB[p] = B[k * N + p + col];
}
for (int m = 0; m < tileY; m++) {
for (int p = 0; p < tileX; p++) {
sum[m][p] += subA[m] * subB[p];
}
}
} //end of K
// write results back
for (int m = 0; m < tileY; m++) {
for (int p = 0; p < tileX; p++) {
C[(row + m) * N + col + p] = sum[m][p];
}
}
});
});
e.wait();
// profiling timestamps are in nanoseconds; convert to milliseconds
duration += (e.get_profiling_info<info::event_profiling::command_end>() -
e.get_profiling_info<info::event_profiling::command_start>()) / 1000.0 / 1000.0;
return duration;
}
// return execution time
// Reference single-threaded GEMM on the host (cC = cA * cB, row-major):
// cA is M x K, cB is K x N, cC is M x N.
// Returns the elapsed wall-clock time in milliseconds.
double cpu_kernel(float *cA, float *cB, float *cC, int M, int N, int K) {
    using host_clock = std::chrono::high_resolution_clock;
    const auto t0 = host_clock::now();
    for (int m = 0; m < M; ++m) {
        for (int n = 0; n < N; ++n) {
            float dot = 0.0f;
            for (int k = 0; k < K; ++k) {
                dot += cA[m * K + k] * cB[k * N + n];
            }
            cC[m * N + n] = dot;
        }
    }
    const auto t1 = host_clock::now();
    return std::chrono::duration<float, std::milli>(t1 - t0).count();
}
// Compare the CPU and GPU result buffers element by element. Any pair whose
// absolute difference exceeds 1e-3 counts as an error and is printed.
// Returns the number of mismatching elements (0 == identical within tolerance).
int verify(float *cpu_res, float *gpu_res, int length){
    int errors = 0;
    for (int n = 0; n < length; ++n) {
        const double gap = fabs(cpu_res[n] - gpu_res[n]);
        if (gap > 1e-3) {
            ++errors;
            printf("\n%lf, %lf", cpu_res[n], gpu_res[n]);
        }
    }
    return errors;
}
// Run the tiled-GEMM benchmark: allocate and initialize the matrices, time
// the GPU and CPU kernels (averaged over several runs after a warm-up),
// cross-check the two results, print sizes/tile configuration/timings, and
// return the mismatch count.
int gemm(const int M,
const int N,
const int K,
const int block_size,
const int iterations,
sycl::queue &q) {
cout << "Problem size: c(" << M << "," << N << ") ="
<< " a(" << M << "," << K << ") *"
<< " b(" << K << "," << N << ")\n";
// USM allocations: A/B/C are shared between host and device, C_host is host-only
auto A = malloc_shared<float>(M * K, q);
auto B = malloc_shared<float>(K * N, q);
auto C = malloc_shared<float>(M * N, q);
auto C_host = malloc_host<float>(M * N, q);
// init the A/B/C
for(int i=0; i < M * K; i++) {
A[i] = random_float();
}
for(int i=0; i < K * N; i++) {
B[i] = random_float();
}
for(int i=0; i < M * N; i++) {
C[i] = 0.0f;
C_host[i] = 0.0f;
}
double flopsPerMatrixMul
= 2.0 * static_cast<double>(M) * static_cast<double>(N) * static_cast<double>(K);
double duration_gpu = 0.0;
double duration_cpu = 0.0;
// GPU computation and timer: discard the first `warmup` runs (JIT, caches)
int warmup = 10;
for (int run = 0; run < iterations + warmup; run++) {
// keep the duration in double; the original narrowed it to float
double duration = gpu_kernel(A, B, C, M, N, K, block_size, q);
if(run >= warmup) duration_gpu += duration;
}
duration_gpu = duration_gpu / iterations;
// CPU computation and timer: half as many runs because the CPU kernel is slow
warmup = 2;
const int cpu_runs = iterations / 2;
for(int run = 0; run < cpu_runs + warmup; run++) {
double duration = cpu_kernel(A, B, C_host, M, N, K);
if(run >= warmup) duration_cpu += duration;
}
// BUG FIX: the original `duration_cpu / iterations/2` divided the sum by
// 2*iterations, but only iterations/2 runs were accumulated (4x too small).
if (cpu_runs > 0) duration_cpu = duration_cpu / cpu_runs;
// Compare the results of CPU and GPU
int errCode = verify(C_host, C, M*N);
if(errCode > 0) printf("\nThere are %d errors\n", errCode);
printf("\nGEMM size M = %d, N = %d, K = %d", M, N, K);
printf("\nWork-Group size = %d * %d, tile_X = %d, tile_Y = %d", block_size, block_size, tileX, tileY);
printf("\nPerformance Flops = %lf, \n"
"GPU Computation Time = %lf (ms); \n"
"CPU Computation Time = %lf (ms); \n",
flopsPerMatrixMul, duration_gpu, duration_cpu);
free(A, q);
free(B, q);
free(C, q);
free(C_host, q);
return(errCode);
}
int main() {
auto propList = cl::sycl::property_list {cl::sycl::property::queue::enable_profiling()};
queue my_gpu_queue( cl::sycl::gpu_selector_v , propList);
int errCode = gemm(2000, 2000, 2000, /* GEMM size, M, N, K */
4, /* workgroup size */
10, /* repeat time */
my_gpu_queue);
return(errCode);
}
然后为进行对比实验,再分别对tile_X和tile_Y的值进行修改,修改后重新编译,运行。
当tile_X=2,tile_Y=4时,运行结果图如下:
当tile_X=2,tile_Y=10时,运行结果图如下:
当tile_X=2,tile_Y=20时,运行结果图如下:
当tile_X=2,tile_Y=100时,运行结果图如下:
当tile_X=4,tile_Y=2时,运行结果图如下:
当tile_X=10,tile_Y=2时,运行结果图如下:
当tile_X=20,tile_Y=2时,运行结果图如下:
当tile_X=100,tile_Y=2时,运行结果图如下:
当tile_X=10,tile_Y=10时,运行结果图如下:
当tile_X=100,tile_Y=100时,运行结果图如下:
实验结果分析
通过对不同的tile_X和tile_Y值进行实验,可以很清楚地看到CPU的计算时间都在5200ms左右,而随着tile_X和tile_Y值变化,GPU的计算时间会不断发生变化。在tile_X值不变时,当tile_Y值较小时,随着其变化,GPU计算时间基本没什么变化;但当该值超过一个阈值后,GPU计算时间会出现断崖式的突然增加。当tile_Y值不变、仅tile_X值变化时,结论类似。当tile_X和tile_Y值同时增大时,在超过一个阈值后,GPU计算时间仍会有断崖式的突然增加,但随着继续增大,又进入一个基本不变的状态。这种断崖式增加可能是由于每个工作项占用的寄存器数量随tile增大而增加,超出硬件限制后发生寄存器溢出,以及内存访问和数据传输方面的瓶颈所导致的。值得注意的是,tile_X和tile_Y值的设定与所设置的M,N,K等初值有关:本实现要求M能被tile_Y整除、N能被tile_X整除(且划分后的网格能被工作组大小整除),当取值不满足该条件时,程序不会显示正确结果。
实验总结
在本次实验中,我们不仅学习了基于 GPU 的 GEMM 矩阵计算和其优化方法,还发现 tile_X 和 tile_Y 的取值会对计算性能产生影响。因此,我们需要根据实验结果选择最优的 tile_X 和 tile_Y 值。此外,这个实验还让我们了解了 GPU 和 CPU 的计算性能差异,并拓展了对并行计算的理解。