描述
本文介绍一个基于oneAPI的C++/SYCL矩阵乘法程序。实现时需要考虑大尺寸矩阵的乘法操作以及不同线程之间的数据依赖关系;通常可以使用分块矩阵乘法并配合共享内存来提高计算效率。
实验环境
本次实验使用英特尔oneAPI Developer Cloud 服务,可免安装额外环境,直接利用Developer Cloud平台中的CPU与GPU硬件完成相应的作业
1. 打开DevCloud平台的在线Jupyter Lab环境
2. 在oneAPI_Essentials/02_SYCL_Program_Structure/下使用SYCL_Program_Structure.ipynb页面中的Lab exercise: Vector Add模块进行实验
实验方法
- 在主机端和GPU端分配内存空间用于存储输入矩阵和输出矩阵
- 将输入矩阵数据从主机端内存传输到GPU端内存中
- 在SYCL中,矩阵乘法的计算通常会在GPU上使用核函数来实现并行计算。核函数会分配线程块和线程来处理不同的数据块。可以使用sycl::parallel_for方法,并行for循环,对输入矩阵进行矩阵乘法
- 在并行计算矩阵乘法时,可以利用线程块和线程的层次结构来优化计算。通过合理划分矩阵数据并利用共享内存来减少全局内存访问的次数,可以大幅提高计算效率。此外可以利用GPU上的多个计算单元并行执行矩阵乘法,进一步提高计算速度
- 计算完成后,将输出矩阵数据从GPU端内存传输回主机端内存中,以便进一步处理或分析
代码实现
#include <chrono> // timing of the host-side run
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <vector>
#include <sycl/sycl.hpp>
using namespace std;
using namespace sycl;
/// Computes C = A * B on the device behind `q` and returns the kernel time in
/// milliseconds.
///
/// All matrices are dense, row-major, stored in flat vectors:
/// A is M x K, B is K x N, C is M x N.
///
/// @param A       left operand, at least M*K elements
/// @param B       right operand, at least K*N elements
/// @param C       output, at least M*N elements; overwritten with the product
/// @param M,N,K   matrix dimensions
/// @param blocks  edge length of the square work-group; must be > 0
/// @param q       queue to submit to; it must have been created with the
///                enable_profiling property, because the returned time is
///                read from the event's profiling info
/// @return device execution time of the kernel in milliseconds (command_start
///         to command_end; host<->device transfers are not included)
double parallel_caculate(std::vector<float> &A, std::vector<float> &B,
                         std::vector<float> &C, int M, int N, int K, int blocks, sycl::queue &q) {
    // An nd_range requires the global size to be a multiple of the local size,
    // so each dimension is rounded up to the next multiple of `blocks`.
    auto grid_rows = (M + blocks - 1) / blocks * blocks;
    auto grid_cols = (N + blocks - 1) / blocks * blocks;
    auto local_ndrange = range<2>(blocks, blocks);
    auto global_ndrange = range<2>(grid_rows, grid_cols);

    // The buffers wrap the host vectors; buf3 writes the result back into C
    // when it is destroyed at the end of this function.
    buffer buf1(A);
    buffer buf2(B);
    buffer buf3(C);

    auto e = q.submit([&](sycl::handler &h) {
        // Inputs are only read and the output is only written; stating that
        // explicitly avoids the default read_write mode on all three buffers.
        accessor A1(buf1, h, sycl::read_only);
        accessor B1(buf2, h, sycl::read_only);
        accessor C1(buf3, h, sycl::write_only);
        h.parallel_for<class k_name_t>(sycl::nd_range<2>(global_ndrange, local_ndrange), [=](sycl::nd_item<2> index) {
            const int row = index.get_global_id(0);
            const int col = index.get_global_id(1);
            // The global range was padded up to a multiple of `blocks`;
            // work-items in the padding must not touch memory.
            if (row >= M || col >= N) {
                return;
            }
            float sum = 0.0f;
            for (int i = 0; i < K; i++) {
                sum += A1[row * K + i] * B1[i * N + col];
            }
            C1[row * N + col] = sum;
        });
    });
    e.wait();

    // Profiling timestamps are in nanoseconds; convert the delta to
    // milliseconds in double precision.
    const auto start_ns = e.get_profiling_info<info::event_profiling::command_start>();
    const auto end_ns = e.get_profiling_info<info::event_profiling::command_end>();
    return static_cast<double>(end_ns - start_ns) / 1.0e6;
}
/// Host reference implementation of C = A * B using a plain triple loop.
///
/// Matrices are dense and row-major in flat vectors: A is M x K, B is K x N,
/// C is M x N. Every element of C in that M x N range is overwritten.
///
/// @return wall-clock time spent in the loops, in milliseconds
double caculate(std::vector<float> &A, std::vector<float> &B, std::vector<float> &C, int M, int N, int K) {
    using wall_clock = std::chrono::high_resolution_clock;

    const wall_clock::time_point started = wall_clock::now();
    for (int row = 0; row < M; ++row) {
        const int a_row_start = row * K;
        const int c_row_start = row * N;
        for (int col = 0; col < N; ++col) {
            float acc = 0.0f;
            for (int inner = 0; inner < K; ++inner) {
                acc += A[a_row_start + inner] * B[inner * N + col];
            }
            C[c_row_start + col] = acc;
        }
    }
    const wall_clock::time_point finished = wall_clock::now();

    // Elapsed time is measured as single-precision milliseconds and widened
    // to double on return.
    const std::chrono::duration<float, std::milli> elapsed_ms(finished - started);
    return elapsed_ms.count();
}
/// Counts the elements in which two result vectors disagree.
///
/// Two elements agree when they compare equal or when their absolute
/// difference is at most 1e-3. A NaN on either side counts as a mismatch.
///
/// @param cpu_res  reference values, at least `length` elements
/// @param gpu_res  values under test, at least `length` elements
/// @param length   number of leading elements to compare
/// @return number of mismatching elements (0 means the results agree)
int verify(std::vector<float> &cpu_res, std::vector<float> &gpu_res, int length){
    constexpr double kTolerance = 1e-3;
    int err = 0;
    for (int i = 0; i < length; i++) {
        // Exact equality first so that matching infinities are accepted
        // (their difference would be NaN).
        if (cpu_res[i] == gpu_res[i]) {
            continue;
        }
        const float diff = std::fabs(cpu_res[i] - gpu_res[i]);
        // Written as a negated "<=": every comparison with NaN is false, so a
        // plain "diff > tolerance" would silently accept NaN results.
        if (!(diff <= kTolerance)) {
            err++;
        }
    }
    return err;
}
/// Runs the matrix-multiplication benchmark for C(M,N) = A(M,K) * B(K,N).
///
/// Fills A and B with rand()-based values in [0, 1], times the device kernel
/// (parallel_caculate) and the host loop (caculate), prints the average time
/// of each, and compares the two results with verify().
///
/// @param M,N,K       matrix dimensions
/// @param blocks      work-group edge length forwarded to parallel_caculate
/// @param iterations  number of timed device runs; the host loop is timed for
///                    iterations/2 runs
/// @param q           queue forwarded to parallel_caculate
/// @return number of elements in which host and device results differ
int compute(const int M,const int N,const int K,const int blocks,const int iterations,sycl::queue &q) {
    cout << "Problem size: c(" << M << "," << N << ") ="
         << " a(" << M << "," << K << ") *"
         << " b(" << K << "," << N << ")\n";

    // Element counts are computed in size_t so large dimensions do not
    // overflow int before the allocation.
    const auto elems = [](int rows, int cols) {
        return static_cast<std::size_t>(rows) * static_cast<std::size_t>(cols);
    };

    // Each matrix is sized by its own shape: A is M x K, B is K x N.
    std::vector<float> A(elems(M, K));
    std::vector<float> B(elems(K, N));
    std::vector<float> C(elems(M, N), 0.0f);   // device result
    std::vector<float> C2(elems(M, N), 0.0f);  // host reference result

    for (float &value : A) {
        value = static_cast<float>(rand() / double(RAND_MAX));
    }
    for (float &value : B) {
        value = static_cast<float>(rand() / double(RAND_MAX));
    }

    // Device timing: the first `gpu_warmup` runs are executed but not counted.
    const int gpu_warmup = 10;
    double duration_gpu = 0.0;
    for (int run = 0; run < iterations + gpu_warmup; run++) {
        const double duration = parallel_caculate(A, B, C, M, N, K, blocks, q);
        if (run >= gpu_warmup)
            duration_gpu += duration;
    }
    if (iterations > 0) {
        duration_gpu /= iterations;
    }

    // Host timing: only iterations/2 runs are timed, so that is also the
    // divisor for the average.
    const int cpu_warmup = 2;
    const int cpu_runs = iterations / 2;
    double duration_cpu = 0.0;
    for (int run = 0; run < cpu_runs + cpu_warmup; run++) {
        const double duration = caculate(A, B, C2, M, N, K);
        if (run >= cpu_warmup)
            duration_cpu += duration;
    }
    if (cpu_runs > 0) {
        duration_cpu /= cpu_runs;
    }

    const int errCode = verify(C2, C, M*N);
    printf(
        "并行计算时间为:%lf (ms); \n"
        "循环计算时间为:%lf (ms); \n",
        duration_gpu, duration_cpu);
    return errCode;
}
/// Entry point: runs the 1024 x 1024 x 1024 benchmark on a GPU queue with
/// profiling enabled, using 4 x 4 work-groups and 10 timed iterations.
///
/// @return 0 when host and device results agree, 1 otherwise
int main() {
    // Profiling must be enabled on the queue: parallel_caculate reads the
    // kernel's start/end timestamps from the submission event.
    auto propList = sycl::property_list {sycl::property::queue::enable_profiling()};
    queue test_q( sycl::gpu_selector_v , propList); // select a GPU device
    const int errCode = compute(1024,1024,1024, 4, 10, test_q);
    // compute() returns a mismatch count. Exit statuses are truncated to
    // 8 bits on POSIX systems, so returning the raw count could report
    // success for a multiple of 256 mismatches; collapse it to 0/1 instead.
    return errCode == 0 ? 0 : 1;
}
学习成果
本次实验使用oneAPI的sycl库实现了并行矩阵乘法,了解到相关的基础概念,对SYCL的编程框架有了更深一步的认识,此外也对Intel oneAPI Developer Cloud 平台的使用有了初步的体验