OpenCL(matmpy)

最新推荐文章于 2024-06-23 14:25:49 发布

GayKoo

最新推荐文章于 2024-06-23 14:25:49 发布

阅读量199

点赞数

分类专栏： OpenCL

本文链接：https://blog.csdn.net/GayKoo/article/details/120132584

版权

c++ opencl

OpenCL 专栏收录该内容

10 篇文章 1 订阅

订阅专栏

例程介绍

分别用OpenCL设备和ARM的OpenMP计算1K x 1K的矩阵乘法运算。

例程源码

Host端源码

//main.cpp
#define __CL_ENABLE_EXCEPTIONS
#include <CL/cl.hpp>
#include <iostream>
#include <cstdio>
#include <fstream>
#include <cstdlib>
#include <cstdio>
#include <signal.h>
#include "ocl_util.h"

#ifdef _TI_RTOS
#include "kernel.dsp_h"
#include <assert.h>
#include "../rtos_main.c"
    #if ti_sysbios_BIOS_version <= (0x65200)
    #include <ti/sysbios/posix/time.h>
    #else
    #include <ti/posix/gcc/time.h>
    #endif
#else
#include <time.h>
#include "omp.h"
#endif

/******************************************************************************
* C[N][M] = A[N][K] * B[K][M];
******************************************************************************/
using namespace cl;
using namespace std;
using std::cout;
using std::cerr;
using std::endl;

#define DIM 256
const int mat_N     = DIM;
const int mat_K     = DIM;
const int mat_M     = DIM;

#ifndef _TI_RTOS
float A       [mat_N * mat_K];
float B       [mat_K * mat_M];
float C       [mat_N * mat_M];
float Golden  [mat_N * mat_M];
#endif

static double clock_diff (struct timespec *t1, struct timespec *t2);
static void   print_mat(float *mat, int rows, int cols);
static void   print_result(float *mat, float *gold, int rows, int cols);
static float  dotprod(const float * A, const float * B, int n);
static void   cpu_mat_mpy(const float *A, const float *B, float *C,
                          int N, int K, int M);

/******************************************************************************
* main
******************************************************************************/
#ifdef _TI_RTOS
void ocl_main(UArg arg0, UArg arg1)
{
   // int    argc = (int)     arg0;
   // char **argv = (char **) arg1;
#else
#define RETURN(x) return x
int main(int argc, char *argv[])
{
#endif
   /*-------------------------------------------------------------------------
   * Catch ctrl-c so we ensure that we call dtors and the dsp is reset properly
   *------------------------------------------------------------------------*/
   signal(SIGABRT, exit);
   signal(SIGTERM, exit);

   struct timespec tp_start, tp_end;

   printf("float C[%d][%d] = float A[%d][%d] x float B[%d][%d]\n",
           mat_N, mat_M, mat_N, mat_K, mat_K, mat_M);

   int mat_size = DIM * DIM * sizeof(cl_float);  //矩阵占用空间
#ifdef _TI_RTOS
   float *A      = (float *) __malloc_ddr(mat_size);
   float *B      = (float *) __malloc_ddr(mat_size);
   float *C      = (float *) __malloc_ddr(mat_size);
   float *Golden = (float *) __malloc_ddr(mat_size);
   assert(A != nullptr && B != nullptr && C != nullptr && Golden != nullptr);
#endif

   /*--------------------------------------------------------------------------
   * Initialize the input matrices to random data
   *-------------------------------------------------------------------------*/
   srand(time(NULL));
   for (int i=0; i < mat_N * mat_K; ++i) A[i] = rand() % 5 + 1;
   for (int i=0; i < mat_K * mat_M; ++i) B[i] = rand() % 5 + 1;
   for (int i=0; i < mat_N * mat_M; ++i) C[i] = 0.0;

   try
   {
   	 //模板化创建对应OpenCL设备的context和获取对应device
     Context context(CL_DEVICE_TYPE_ACCELERATOR);
     std::vector<Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
     int                 nDev    = devices.size();

     /*---------------------------------------------------------------------
     * Compile the Kernel Source for the devices
     *--------------------------------------------------------------------*/
#ifndef _TI_RTOS
     char *bin;
     int bin_length = ocl_read_binary("kernel.out", bin); //读取编译成二进制文件的cl文件

     Program::Binaries   binary(1, std::make_pair(bin, bin_length));
     Program             program = Program(context, devices, binary);
     program.build(devices);  //创建Program并与与device建立链接

     delete [] bin;
#else
     Program::Binaries   binary(1, make_pair(kernel_dsp_bin,
                                             sizeof(kernel_dsp_bin)));
     Program             program = Program(context, devices, binary);
     program.build(devices);
#endif

     Buffer bufB   (context, CL_MEM_READ_ONLY,  mat_size);
     Buffer bufGold(context, CL_MEM_READ_ONLY,  mat_size);
     Kernel kernel (program, "ocl_matmpy"); //为该Program创建对应kernel，第二项参数对应cl函数名

	//为kernel设置参数
     kernel.setArg(1, bufB);
     kernel.setArg(3, __local(mat_K * sizeof(float)));
     kernel.setArg(4, mat_K);
     kernel.setArg(5, mat_N);

     unsigned AChunk = mat_size / nDev; //每个设备处理的数据大小
     unsigned CChunk = mat_size / nDev;

     std::vector<CommandQueue*> Q (nDev); //创建nDEV个CommandQueue

	//Muticore-shared memory controller，多核共享内存控制器
     cl_mem_flags use_msmc = CL_MEM_USE_MSMC_TI;
     for (int d = 0; d < nDev; d++)
     {
         std::string dev_exts;
         devices[d].getInfo(CL_DEVICE_EXTENSIONS, &dev_exts);
         if (dev_exts.find("cl_ti_msmc_buffers") != std::string::npos)
         {
            cl_ulong msmc_size = 0;
            devices[d].getInfo(CL_DEVICE_MSMC_MEM_SIZE_TI, &msmc_size);
            if (msmc_size >= AChunk)  continue;
         }
         use_msmc = 0;  //msmc容量不足，不使用msmc
         break;
     }
     std::vector<Buffer> bufA(nDev, Buffer(context,
                                           CL_MEM_READ_ONLY|use_msmc,  AChunk)); //每个Buffer是基于context，对应flag，大小为AChunk
     std::vector<Buffer> bufC(nDev, Buffer(context, CL_MEM_WRITE_ONLY, CChunk));
     std::vector<Event>  ev(nDev, Event()); //每个device对应一个Event

     for (int d = 0; d < nDev; d++) Q[d]= new CommandQueue (context,devices[d]); //每个device对应一个CommandQueue

     clock_gettime(CLOCK_MONOTONIC, &tp_start);
     for (int d = 0; d < nDev; d++)
     {
     	//命令进队，从Host端写一个buffer 最后一个参数为目的地址
     	//结合clEnqueueWriteBuffer()和c++ Bindings的enqueueWriteBuffer（）查看参数
         Q[d]->enqueueWriteBuffer(bufA[d], CL_FALSE, 0, AChunk,
                                  &A[d*AChunk/sizeof(float)]);

         Q[d]->enqueueWriteBuffer(bufB, CL_FALSE, 0, mat_size, B);

         /*--------------------------------------------------------------------
         * One work item per cell in result matrix
         * One work group per column in result matrix
         *-------------------------------------------------------------------*/
         kernel.setArg(0, bufA[d]);
         kernel.setArg(2, bufC[d]);
         Q[d]->enqueueNDRangeKernel(kernel, NullRange, NDRange(mat_M/nDev),
                                                       NDRange(1)); //参数3表示每个设备处理的维数，参数4表示一个work-item组成一个work-group
         Q[d]->enqueueReadBuffer(bufC[d], CL_FALSE, 0, CChunk,
                                 &C[d*CChunk/sizeof(float)], NULL, &ev[d]);
     }  //等待ev[d]完成后再从OpenCL设备读buffer回Host端

     for (cl_uint d = 0; d < Q.size(); d++) ev[d].wait(); //执行
     clock_gettime(CLOCK_MONOTONIC, &tp_end);

     /*---------------------------------------------------------------------
     * Cleanup OpenCL objects
     *--------------------------------------------------------------------*/
     for (cl_uint d = 0; d < Q.size(); d++) delete Q[d];

     double elapsed = clock_diff (&tp_start, &tp_end);
     printf("OpenCL dispatching to %d DSP(S): %6.4f secs\n", nDev, elapsed);
   }

   catch (Error& err)
   {
     cerr << "ERROR: " << err.what() << "(" << err.err() << ", "
          << ocl_decode_error(err.err()) << ")" << endl;
     exit(-1);
   }

   clock_gettime(CLOCK_MONOTONIC, &tp_start);
   cpu_mat_mpy(A, B, Golden, mat_N, mat_K, mat_M); //OpenMP执行，暂时搁置
   clock_gettime(CLOCK_MONOTONIC, &tp_end);

   double elapsed = clock_diff (&tp_start, &tp_end);
#ifndef _TI_RTOS
   printf("OpenMP dispatching to 4 CPU(S): %6.4f secs\n", elapsed);
#else
   printf("Host dispatching to 1 CPU(S): %6.4f secs\n", elapsed);
#endif

   print_mat(A,      mat_N, mat_K);
   print_mat(B,      mat_K, mat_M);
   print_mat(Golden, mat_N, mat_M);
   print_mat(C,      mat_N, mat_M);

   print_result(C, Golden, mat_N, mat_M);

   for (int i = 0; i < mat_N * mat_M; i++)
       if (Golden[i] != C[i])
       {
           int x = i / mat_M;
           int y = i % mat_M;

           std::cout << "Error at [" << x << "][" << y << "] : "
                     << Golden[i] << " != "
                     << C[i] << std::endl;
           RETURN(-1);
       }

#ifdef _TI_RTOS
   __free_ddr(A);
   __free_ddr(B);
   __free_ddr(C);
   __free_ddr(Golden);
#endif

   std::cout << "Passed!" << std::endl;

   RETURN(0);
}

/******************************************************************************
* cpu_mat_mpy
******************************************************************************/
void cpu_mat_mpy(const float * A, const float * B, float * C, int mat_N,
                 int mat_K, int mat_M)
{
#ifndef _TI_RTOS
    #pragma omp parallel for
#endif
    for (int col = 0; col < mat_M; ++col)
    {
        float b_col[mat_K];

        for (int row = 0; row < mat_K; ++row)
            b_col[row] = B[row*mat_M+col];

        for (int row = 0; row < mat_N; ++row)
            C[row*mat_M+col] = dotprod(A + (row * mat_K), b_col, mat_K);
    }
}

/******************************************************************************
* dotprod
******************************************************************************/
float dotprod(const float * A, const float * B, int n)
{
    float result = 0;
    for (int i = 0; i < n; ++i) result += A[i] * B[i];
    return result;
}

/******************************************************************************
* clock_diff
******************************************************************************/
static double clock_diff (struct timespec *t1, struct timespec *t2)
       { return t2->tv_sec - t1->tv_sec + (t2->tv_nsec - t1->tv_nsec) / 1e9; }

/******************************************************************************
* print_mat
******************************************************************************/
static void print_mat(float *mat, int rows, int cols)
{
    if (cols > 16) return;

    for (int r = 0 ; r < rows; r++)
    {
      for (int c = 0 ; c < cols; c++)
          printf("%3.0f ", mat[r*(cols)+c]);
      printf("\n");
    }
    printf("\n");
}

static void print_result(float *mat, float *gold, int rows, int cols)
{
    if (cols > 64) return;

    for (int r = 0 ; r < rows; r++)
    {
      for (int c = 0 ; c < cols; c++)
          printf("%c", mat[r*(cols)+c] != gold[r*(cols)+c] ? 'x' : '-');
      printf("\n");
    }
    printf("\n");
}

//ccode.cpp
float dotprod(const float* restrict A, const float* restrict B, int n) 
{
    int   i;
    float result = 0;

    _nassert(n > 0);
    _nassert(n % 8 == 0);
    _nassert((int)A % 8 == 0);
    _nassert((int)B % 8 == 0);

    for (i = 0; i < n; ++i) result += A[i] * B[i];
    return result;
}

OpenCL设备端源码

float dotprod   (const global float  *a, const local float  *b, int n);

/******************************************************************************
* C = A x B 
******************************************************************************/
kernel void ocl_matmpy(const global float *a, 
		       const global float *b, 
		             global float *c, 
		             local  float *b_column,
		                    int    a_wid,
                                    int    a_hgt)
{
    int col      = get_global_id(0);
    int width_c  = get_global_size(0);
    int width_a  = a_wid;
    int width_b  = width_c;
    int height_c = a_hgt;
    int height_a = a_hgt;
    int height_b = a_wid;
    int i, row;

	//异步写 有两个函数，由第一、二个参数决定
	//第一个参数，目的buffer
	//第二个参数，源buffer
	//第三个参数，写数量
	//第四个参数，从源（目标，根据不同函数而定）读取一次后的移动步数
	//第五个参数，等待的事件
    event_t ev = async_work_group_strided_copy(b_column, &b[col], 
					       height_b, width_b, 0); //从b中取一列数据存到b_column
    wait_group_events(1, &ev); //等待事件完成，第一个参数表示事件数，第二个参数表示等待的事件

    for (row = 0; row < height_a; ++row)
    {
    	//从参数1读大小为参数2*sizeof(参数1)的数据到全局缓存
        prefetch(&a[row*width_a], width_a); //每次读a的一行到全局缓存
        c[row * width_c + col] = dotprod(&a[row*width_a], b_column, width_a);
    }
}