例程介绍
分别用OpenCL设备和ARM的OpenMP计算1K x 1K的矩阵乘法运算。
例程源码
Host端源码
//main.cpp
#define __CL_ENABLE_EXCEPTIONS
#include <CL/cl.hpp>
#include <iostream>
#include <cstdio>
#include <fstream>
#include <cstdlib>
#include <cstdio>
#include <signal.h>
#include "ocl_util.h"
#ifdef _TI_RTOS
#include "kernel.dsp_h"
#include <assert.h>
#include "../rtos_main.c"
#if ti_sysbios_BIOS_version <= (0x65200)
#include <ti/sysbios/posix/time.h>
#else
#include <ti/posix/gcc/time.h>
#endif
#else
#include <time.h>
#include "omp.h"
#endif
/******************************************************************************
* C[N][M] = A[N][K] * B[K][M];
******************************************************************************/
using namespace cl;
using namespace std;
using std::cout;
using std::cerr;
using std::endl;
#define DIM 256
const int mat_N = DIM;
const int mat_K = DIM;
const int mat_M = DIM;
#ifndef _TI_RTOS
float A [mat_N * mat_K];
float B [mat_K * mat_M];
float C [mat_N * mat_M];
float Golden [mat_N * mat_M];
#endif
static double clock_diff (struct timespec *t1, struct timespec *t2);
static void print_mat(float *mat, int rows, int cols);
static void print_result(float *mat, float *gold, int rows, int cols);
static float dotprod(const float * A, const float * B, int n);
static void cpu_mat_mpy(const float *A, const float *B, float *C,
int N, int K, int M);
/******************************************************************************
* main
******************************************************************************/
#ifdef _TI_RTOS
void ocl_main(UArg arg0, UArg arg1)
{
// int argc = (int) arg0;
// char **argv = (char **) arg1;
#else
#define RETURN(x) return x
int main(int argc, char *argv[])
{
#endif
/*-------------------------------------------------------------------------
* Catch ctrl-c so we ensure that we call dtors and the dsp is reset properly
*------------------------------------------------------------------------*/
signal(SIGABRT, exit);
signal(SIGTERM, exit);
struct timespec tp_start, tp_end;
printf("float C[%d][%d] = float A[%d][%d] x float B[%d][%d]\n",
mat_N, mat_M, mat_N, mat_K, mat_K, mat_M);
int mat_size = DIM * DIM * sizeof(cl_float); //矩阵占用空间
#ifdef _TI_RTOS
float *A = (float *) __malloc_ddr(mat_size);
float *B = (float *) __malloc_ddr(mat_size);
float *C = (float *) __malloc_ddr(mat_size);
float *Golden = (float *) __malloc_ddr(mat_size);
assert(A != nullptr && B != nullptr && C != nullptr && Golden != nullptr);
#endif
/*--------------------------------------------------------------------------
* Initialize the input matrices to random data
*-------------------------------------------------------------------------*/
srand(time(NULL));
for (int i=0; i < mat_N * mat_K; ++i) A[i] = rand() % 5 + 1;
for (int i=0; i < mat_K * mat_M; ++i) B[i] = rand() % 5 + 1;
for (int i=0; i < mat_N * mat_M; ++i) C[i] = 0.0;
try
{
//模板化创建对应OpenCL设备的context和获取对应device
Context context(CL_DEVICE_TYPE_ACCELERATOR);
std::vector<Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
int nDev = devices.size();
/*---------------------------------------------------------------------
* Compile the Kernel Source for the devices
*--------------------------------------------------------------------*/
#ifndef _TI_RTOS
char *bin;
int bin_length = ocl_read_binary("kernel.out", bin); //读取编译成二进制文件的cl文件
Program::Binaries binary(1, std::make_pair(bin, bin_length));
Program program = Program(context, devices, binary);
program.build(devices); //创建Program并与与device建立链接
delete [] bin;
#else
Program::Binaries binary(1, make_pair(kernel_dsp_bin,
sizeof(kernel_dsp_bin)));
Program program = Program(context, devices, binary);
program.build(devices);
#endif
Buffer bufB (context, CL_MEM_READ_ONLY, mat_size);
Buffer bufGold(context, CL_MEM_READ_ONLY, mat_size);
Kernel kernel (program, "ocl_matmpy"); //为该Program创建对应kernel,第二项参数对应cl函数名
//为kernel设置参数
kernel.setArg(1, bufB);
kernel.setArg(3, __local(mat_K * sizeof(float)));
kernel.setArg(4, mat_K);
kernel.setArg(5, mat_N);
unsigned AChunk = mat_size / nDev; //每个设备处理的数据大小
unsigned CChunk = mat_size / nDev;
std::vector<CommandQueue*> Q (nDev); //创建nDEV个CommandQueue
//Muticore-shared memory controller,多核共享内存控制器
cl_mem_flags use_msmc = CL_MEM_USE_MSMC_TI;
for (int d = 0; d < nDev; d++)
{
std::string dev_exts;
devices[d].getInfo(CL_DEVICE_EXTENSIONS, &dev_exts);
if (dev_exts.find("cl_ti_msmc_buffers") != std::string::npos)
{
cl_ulong msmc_size = 0;
devices[d].getInfo(CL_DEVICE_MSMC_MEM_SIZE_TI, &msmc_size);
if (msmc_size >= AChunk) continue;
}
use_msmc = 0; //msmc容量不足,不使用msmc
break;
}
std::vector<Buffer> bufA(nDev, Buffer(context,
CL_MEM_READ_ONLY|use_msmc, AChunk)); //每个Buffer是基于context,对应flag,大小为AChunk
std::vector<Buffer> bufC(nDev, Buffer(context, CL_MEM_WRITE_ONLY, CChunk));
std::vector<Event> ev(nDev, Event()); //每个device对应一个Event
for (int d = 0; d < nDev; d++) Q[d]= new CommandQueue (context,devices[d]); //每个device对应一个CommandQueue
clock_gettime(CLOCK_MONOTONIC, &tp_start);
for (int d = 0; d < nDev; d++)
{
//命令进队,从Host端写一个buffer 最后一个参数为目的地址
//结合clEnqueueWriteBuffer()和c++ Bindings的enqueueWriteBuffer()查看参数
Q[d]->enqueueWriteBuffer(bufA[d], CL_FALSE, 0, AChunk,
&A[d*AChunk/sizeof(float)]);
Q[d]->enqueueWriteBuffer(bufB, CL_FALSE, 0, mat_size, B);
/*--------------------------------------------------------------------
* One work item per cell in result matrix
* One work group per column in result matrix
*-------------------------------------------------------------------*/
kernel.setArg(0, bufA[d]);
kernel.setArg(2, bufC[d]);
Q[d]->enqueueNDRangeKernel(kernel, NullRange, NDRange(mat_M/nDev),
NDRange(1)); //参数3表示每个设备处理的维数,参数4表示一个work-item组成一个work-group
Q[d]->enqueueReadBuffer(bufC[d], CL_FALSE, 0, CChunk,
&C[d*CChunk/sizeof(float)], NULL, &ev[d]);
} //等待ev[d]完成后再从OpenCL设备读buffer回Host端
for (cl_uint d = 0; d < Q.size(); d++) ev[d].wait(); //执行
clock_gettime(CLOCK_MONOTONIC, &tp_end);
/*---------------------------------------------------------------------
* Cleanup OpenCL objects
*--------------------------------------------------------------------*/
for (cl_uint d = 0; d < Q.size(); d++) delete Q[d];
double elapsed = clock_diff (&tp_start, &tp_end);
printf("OpenCL dispatching to %d DSP(S): %6.4f secs\n", nDev, elapsed);
}
catch (Error& err)
{
cerr << "ERROR: " << err.what() << "(" << err.err() << ", "
<< ocl_decode_error(err.err()) << ")" << endl;
exit(-1);
}
clock_gettime(CLOCK_MONOTONIC, &tp_start);
cpu_mat_mpy(A, B, Golden, mat_N, mat_K, mat_M); //OpenMP执行,暂时搁置
clock_gettime(CLOCK_MONOTONIC, &tp_end);
double elapsed = clock_diff (&tp_start, &tp_end);
#ifndef _TI_RTOS
printf("OpenMP dispatching to 4 CPU(S): %6.4f secs\n", elapsed);
#else
printf("Host dispatching to 1 CPU(S): %6.4f secs\n", elapsed);
#endif
print_mat(A, mat_N, mat_K);
print_mat(B, mat_K, mat_M);
print_mat(Golden, mat_N, mat_M);
print_mat(C, mat_N, mat_M);
print_result(C, Golden, mat_N, mat_M);
for (int i = 0; i < mat_N * mat_M; i++)
if (Golden[i] != C[i])
{
int x = i / mat_M;
int y = i % mat_M;
std::cout << "Error at [" << x << "][" << y << "] : "
<< Golden[i] << " != "
<< C[i] << std::endl;
RETURN(-1);
}
#ifdef _TI_RTOS
__free_ddr(A);
__free_ddr(B);
__free_ddr(C);
__free_ddr(Golden);
#endif
std::cout << "Passed!" << std::endl;
RETURN(0);
}
/******************************************************************************
* cpu_mat_mpy
******************************************************************************/
void cpu_mat_mpy(const float * A, const float * B, float * C, int mat_N,
int mat_K, int mat_M)
{
#ifndef _TI_RTOS
#pragma omp parallel for
#endif
for (int col = 0; col < mat_M; ++col)
{
float b_col[mat_K];
for (int row = 0; row < mat_K; ++row)
b_col[row] = B[row*mat_M+col];
for (int row = 0; row < mat_N; ++row)
C[row*mat_M+col] = dotprod(A + (row * mat_K), b_col, mat_K);
}
}
/******************************************************************************
* dotprod
******************************************************************************/
float dotprod(const float * A, const float * B, int n)
{
float result = 0;
for (int i = 0; i < n; ++i) result += A[i] * B[i];
return result;
}
/******************************************************************************
* clock_diff
******************************************************************************/
static double clock_diff (struct timespec *t1, struct timespec *t2)
{ return t2->tv_sec - t1->tv_sec + (t2->tv_nsec - t1->tv_nsec) / 1e9; }
/******************************************************************************
* print_mat
******************************************************************************/
static void print_mat(float *mat, int rows, int cols)
{
if (cols > 16) return;
for (int r = 0 ; r < rows; r++)
{
for (int c = 0 ; c < cols; c++)
printf("%3.0f ", mat[r*(cols)+c]);
printf("\n");
}
printf("\n");
}
static void print_result(float *mat, float *gold, int rows, int cols)
{
if (cols > 64) return;
for (int r = 0 ; r < rows; r++)
{
for (int c = 0 ; c < cols; c++)
printf("%c", mat[r*(cols)+c] != gold[r*(cols)+c] ? 'x' : '-');
printf("\n");
}
printf("\n");
}
//ccode.cpp
float dotprod(const float* restrict A, const float* restrict B, int n)
{
int i;
float result = 0;
_nassert(n > 0);
_nassert(n % 8 == 0);
_nassert((int)A % 8 == 0);
_nassert((int)B % 8 == 0);
for (i = 0; i < n; ++i) result += A[i] * B[i];
return result;
}
OpenCL设备端源码
float dotprod (const global float *a, const local float *b, int n);
/******************************************************************************
* C = A x B
******************************************************************************/
kernel void ocl_matmpy(const global float *a,
const global float *b,
global float *c,
local float *b_column,
int a_wid,
int a_hgt)
{
int col = get_global_id(0);
int width_c = get_global_size(0);
int width_a = a_wid;
int width_b = width_c;
int height_c = a_hgt;
int height_a = a_hgt;
int height_b = a_wid;
int i, row;
//异步写 有两个函数,由第一、二个参数决定
//第一个参数,目的buffer
//第二个参数,源buffer
//第三个参数,写数量
//第四个参数,从源(目标,根据不同函数而定)读取一次后的移动步数
//第五个参数,等待的事件
event_t ev = async_work_group_strided_copy(b_column, &b[col],
height_b, width_b, 0); //从b中取一列数据存到b_column
wait_group_events(1, &ev); //等待事件完成,第一个参数表示事件数,第二个参数表示等待的事件
for (row = 0; row < height_a; ++row)
{
//从参数1读大小为参数2*sizeof(参数1)的数据到全局缓存
prefetch(&a[row*width_a], width_a); //每次读a的一行到全局缓存
c[row * width_c + col] = dotprod(&a[row*width_a], b_column, width_a);
}
}