本文是一个利用DMA通信的例子,主要参考官方文档实现。软件的操作步骤可以参考前面文章: 链接
一、Overview
Zedboard开发板由PS(Processing System)和PL(Programmable Logic)两部分构成,PS部分由两个ARM CPU核构成,PL部分是FPGA可编程逻辑。本文通过DMA(Direct Memory Access)方式实现两部分的通信,对矩阵乘应用进行加速。系统架构图如下所示,左侧是PS端,右端是PL端。控制信号通过PS端的GP0接口与PL端的IP核交互(AXI-lite总线),读写数据通过PS端的ACP接口与PL端进行交互(AXI-Stream总线)。DMA和Timer是利用Vivado中固有IP核实现,HLS IP core是利用Vivado HLS编写的矩阵乘IP核。
基本参数
工具版本:2017.4
实现功能:矩阵乘(float类型)
传输方式:DMA
传输总线:AXI-stream
二、HLS
HLS部分的代码文件如下。参考上一篇文章的步骤,直接综合、生成IP核即可。// mmult_accel.cpp
#include <stdio.h>
#include <stdlib.h>
#include "mmult.h"
#define MCR_SIZE 1024
// Thin non-template wrapper around mmult_hw so the 32x32 float multiply can
// be called (and C-simulated) without template syntax. Computes C = A * B.
void standalone_mmult (float A[32][32], float B[32][32], float C[32][32])
{
mmult_hw <float, 32>(A, B, C);
}
// HLS top-level function for synthesis.
// INPUT_STREAM carries matrix A followed by matrix B (2*MCR_SIZE beats);
// OUTPUT_STREAM carries the 32x32 float product (MCR_SIZE beats).
void HLS_accel (AXI_VAL INPUT_STREAM[2*MCR_SIZE], AXI_VAL OUTPUT_STREAM[MCR_SIZE])
{
// Control (start/done) register file exposed over AXI4-Lite as CONTROL_BUS.
#pragma HLS INTERFACE s_axilite port=return bundle=CONTROL_BUS
// Data ports mapped to AXI4-Stream interfaces (driven by the AXI DMA).
#pragma HLS INTERFACE axis port=OUTPUT_STREAM
#pragma HLS INTERFACE axis port=INPUT_STREAM
wrapped_mmult_hw <float, 32, 32*32, 4, 5, 5>(INPUT_STREAM, OUTPUT_STREAM);
return;
}
// mmult.h — shared declarations for the HLS matrix-multiply accelerator.
#include <assert.h>
#include <ap_axi_sdata.h>
// NOTE(review): MCR_SIZE is also defined in mmult_accel.cpp — keep in sync.
#define MCR_SIZE 1024
typedef ap_axiu<32,4,5,5> AXI_VAL; // one AXI-Stream beat: 32-bit data plus 4/5/5-bit USER/ID/DEST side channels
// function prototypes
void standalone_mmult (float A[32][32], float B[32][32], float C[32][32]);
void HLS_accel (AXI_VAL in_stream[2*MCR_SIZE], AXI_VAL out_stream[MCR_SIZE]);
/* ****************************** C++ TEMPLATES ************************************** */
// Reference (software) matrix multiply: out = a * b for DIM x DIM matrices.
// Used by the testbench to validate the hardware result.
template <typename T, int DIM>
void matrix_multiply_ref(T a[DIM][DIM], T b[DIM][DIM], T out[DIM][DIM])
{
	// Classic triple loop over rows, columns, and the inner product index.
	for (int ia = 0; ia < DIM; ++ia)
		for (int ib = 0; ib < DIM; ++ib)
		{
			// Fix: accumulate in T (the original used a hard-coded float),
			// so the template is correct when instantiated with double.
			T sum = 0;
			for (int id = 0; id < DIM; ++id)
				sum += a[ia][id] * b[id][ib];
			out[ia][ib] = sum;
		}
	return;
}
// --------------------------------------------------------
// compute:function to be accelerated in HW
// Hardware matrix multiply: out = a * b, one result element per cycle.
template <typename T, int DIM>
void mmult_hw(T a[DIM][DIM], T b[DIM][DIM], T out[DIM][DIM])
{
// NOTE(review): FACTOR is unused — the array_partition pragmas below use
// the literal 16 instead; consider keeping them in sync.
int const FACTOR = DIM/2;
#pragma HLS INLINE
// Partition the operands so 16 elements of a row of 'a' and of a column of
// 'b' can be read in parallel, enabling the II=1 pipeline below.
#pragma HLS array_partition variable=a block factor=16 dim=2
#pragma HLS array_partition variable=b block factor=16 dim=1
// matrix multiplication of a A*B matrix
L1:for (int ia = 0; ia < DIM; ++ia)
L2:for (int ib = 0; ib < DIM; ++ib)
{
// Pipeline the element loop; HLS unrolls inner loop L3 under it.
#pragma HLS PIPELINE II=1
T sum = 0;
L3:for (int id = 0; id < DIM; ++id)
sum += a[ia][id] * b[id][ib];
out[ia][ib] = sum;
}
return;
}
// --------------------------------------------------------
// functions to insert and extract elements from an axi stream
// includes conversion to correct data type
// input from axi
// Pop one beat from the AXI-Stream and reinterpret its 32-bit payload as T.
template <typename T, int U, int TI, int TD>
T pop_stream(ap_axiu <sizeof(T)*8,U,TI,TD> const &e)
{
#pragma HLS INLINE
assert(sizeof(T) == sizeof(int));
union // type-pun via union: reinterpret the 32-bit int payload as a 32-bit T (e.g. float)
{
int ival;
T oval;
} converter;
converter.ival = e.data;
T ret = converter.oval;
// These values are unused; the reads into volatiles only ensure the
// side-channel fields of the struct are consumed.
volatile ap_uint<sizeof(T)> strb = e.strb;
volatile ap_uint<sizeof(T)> keep = e.keep;
volatile ap_uint<U> user = e.user;
volatile ap_uint<1> last = e.last;
volatile ap_uint<TI> id = e.id;
volatile ap_uint<TD> dest = e.dest;
return ret;
}
// Output to axis
// Pack one value of T into an AXI-Stream beat; 'last' drives TLAST.
template <typename T, int U, int TI, int TD>
ap_axiu <sizeof(T)*8,U,TI,TD> push_stream(T const &v, bool last = false)
{
#pragma HLS INLINE
ap_axiu<sizeof(T)*8,U,TI,TD> e;
assert(sizeof(T) == sizeof(int));
union // type-pun: reinterpret the 32-bit T as the int stream payload
{
int oval;
T ival;
} converter;
converter.ival = v;
e.data = converter.oval;
// set it to sizeof(T) ones
e.strb = -1;
e.keep = 15; //e.strb;
e.user = 0;
e.last = last ? 1 : 0;
e.id = 0;
e.dest = 0;
return e;
}
// --------------------------------------------------------------------
// function to be accelerated in HW wrapped with AXI4-Stream interface
// top function for accelerator
// Streaming wrapper: unpacks A (first SIZE beats) and B (next SIZE beats)
// from in_stream, multiplies them, and packs the product into out_stream.
template <typename T, int DIM, int SIZE, int U, int TI, int TD>
void wrapped_mmult_hw (
AXI_VAL in_stream[2*SIZE],
AXI_VAL out_stream[SIZE])
{
#pragma HLS INLINE
T a[DIM][DIM];
T b[DIM][DIM];
T out[DIM][DIM];
// The stream payload is 32 bits wide, so T must be a 32-bit type.
assert(sizeof(T)*8 == 32);
// input1:stream in first matrix
for(int i=0; i<DIM; i++)
for(int j=0; j<DIM; j++)
{
#pragma HLS PIPELINE II=1
int k = i*DIM+j;
a[i][j] = pop_stream<T,U,TI,TD>(in_stream[k]);
}
// input2:stream in second matrix (offset by SIZE beats in the input stream)
for(int i=0; i<DIM; i++)
for(int j=0; j<DIM; j++)
{
#pragma HLS PIPELINE II=1
int k = i*DIM+j+SIZE;
b[i][j] = pop_stream<T,U,TI,TD>(in_stream[k]);
}
// compute3:do HW multiplication
mmult_hw<T, DIM>(a,b,out);
// output4:stream out result matrix; TLAST is asserted on the final beat
for(int i=0; i<DIM; i++)
for(int j=0; j<DIM; j++)
{
#pragma HLS PIPELINE II=1
int k = i*DIM+j;
out_stream[k] = push_stream<T,U,TI,TD>(out[i][j],k == (SIZE-1));
}
return;
}
// Self-checking testbench: streams two DIM x DIM matrices through the
// wrapped accelerator and compares the result against the software
// reference. Returns 0 on success, nonzero on the first mismatch.
template <typename T, int DIM, int SIZE, int U, int TI, int TD>
int test_matrix_mult(void)
{
	T matOp1[DIM][DIM];
	T matOp2[DIM][DIM];
	T matMult_sw[DIM][DIM];
	T matMult_hw[DIM][DIM];

	// Deterministic operand patterns: op1[r][c] = r+c, op2[r][c] = r*c.
	for (int r = 0; r < DIM; r++)
		for (int c = 0; c < DIM; c++)
		{
			matOp1[r][c] = (float)(r + c);
			matOp2[r][c] = (float)(r * c);
		}

	printf("DEBUGGING AXI4 STREAMING DATA TYPES!\r\n");

	// Build the input stream: matrix A occupies the first SIZE beats,
	// matrix B the next SIZE. TLAST is raised on the very last beat.
	AXI_VAL inp_stream[2*SIZE];
	AXI_VAL out_stream[SIZE];
	assert(sizeof(T)*8 == 32);

	for (int r = 0; r < DIM; r++)
		for (int c = 0; c < DIM; c++)
		{
			int idx = r * DIM + c;
			inp_stream[idx] = push_stream<T,U,TI,TD>(matOp1[r][c], 0);
			inp_stream[idx + SIZE] = push_stream<T,U,TI,TD>(matOp2[r][c], idx == (SIZE-1));
		}

	// Run the DUT, then unpack its output stream into matMult_hw.
	wrapped_mmult_hw<T, DIM, SIZE, U, TI, TD>(inp_stream, out_stream);
	for (int r = 0; r < DIM; r++)
		for (int c = 0; c < DIM; c++)
			matMult_hw[r][c] = pop_stream<T,U,TI,TD>(out_stream[r * DIM + c]);

	// Golden result from the software reference implementation.
	matrix_multiply_ref<T, DIM>(matOp1, matOp2, matMult_sw);

	// Element-wise comparison; bail out at the first mismatch.
	int err = 0;
	for (int r = 0; (r < DIM && !err); r++)
		for (int c = 0; (c < DIM && !err); c++)
			if (matMult_sw[r][c] != matMult_hw[r][c])
				err++;

	if (err == 0)
		printf("Matrixes identical ... Test successful!\r\n");
	else
		printf("Test failed!\r\n");
	return err;
}
#include <stdio.h>
#include <stdlib.h>
#include "mmult.h"
/*
 * Testbench entry points:
 *  - main (normal build): compares mmult_sw against standalone_mmult to
 *    verify the hardware function is correct.
 *  - main (DB_DEBUG build): uses the header templates to generate data and
 *    simulate the full pop -> compute -> push streaming flow.
 *  - HLS_accel: the top function with stream ports; this is what is
 *    synthesized into the final hardware.
 */
typedef float T;
int const DIM = 32;
int const SIZE = DIM*DIM;
// Plain software matrix multiply (reference implementation): out = a * b.
void mmult_sw(T a[DIM][DIM], T b[DIM][DIM], T out[DIM][DIM])
{
	// Row-by-column inner products over the DIM x DIM operands.
	for (int row = 0; row < DIM; ++row)
		for (int col = 0; col < DIM; ++col)
		{
			float acc = 0;
			for (int k = 0; k < DIM; ++k)
				acc += a[row][k] * b[k][col];
			out[row][col] = acc;
		}
}
// 1. Debug build: run the full streaming testbench from mmult.h
#ifdef DB_DEBUG
int main(void)
{
int ret_val = 0;
// Generates data internally and simulates the pop -> compute -> push flow
// with AXI side-channel widths USER=4, ID=5, DEST=5.
ret_val = test_matrix_mult<T, DIM, SIZE, 4,5,5>();
return ret_val;
}
#else
// 2. Normal build: compare the hardware function against the SW reference
int main(void)
{
int ret_val = 0; // NOTE(review): unused — the function returns err below
int i,j, err;
T matOp1[DIM][DIM];
T matOp2[DIM][DIM];
T matMult_sw[DIM][DIM];
T matMult_hw[DIM][DIM];
/** Matrix Initiation: op1[i][j] = i+j, op2[i][j] = i*j */
for(i = 0; i<DIM; i++)
for(j = 0; j<DIM; j++)
matOp1[i][j] = (float)(i+j);
for(i = 0; i<DIM; i++)
for(j = 0; j<DIM; j++)
matOp2[i][j] = (float)(i*j);
/** End of Initiation */
printf("NORMAL MODE\r\n");
// Hardware-equivalent function under test.
standalone_mmult(matOp1, matOp2, matMult_hw);
/* reference Matrix Multiplication */
mmult_sw(matOp1, matOp2, matMult_sw);
/** Matrix comparison — stop at the first mismatch */
err = 0;
for (i = 0; (i<DIM && !err); i++)
for (j = 0; (j<DIM && !err); j++)
if (matMult_sw[i][j] != matMult_hw[i][j])
err++;
if (err == 0)
printf("Matrixes identical ... Test successful!\r\n");
else
printf("Test failed!\r\n");
return err;
}
#endif
三、Vivado
在Vivado中按照前述系统架构搭建Block Design(系统框图),各IP核的地址由工具自动分配。
四、SDK
// main.c — bare-metal host application: times SW vs. DMA+HW matrix multiply.
#include <stdio.h>
#include <stdlib.h>
#include "platform.h"
#include "xparameters.h"
#include "xtmrctr.h"
#include "xaxidma.h"
#include "lib_xmmult_hw.h"
// Number of timed iterations for both the SW and HW measurements.
#define NUM_TESTS 1024
#define XPAR_AXI_TIMER_DEVICE_ID (XPAR_AXI_TIMER_0_DEVICE_ID)
// TIMER Instance (AXI timer in the PL, used for cycle counting)
XTmrCtr timer_dev;
// AXI DMA Instance (shared with lib_xmmult_hw.c via extern)
XAxiDma AxiDma;
int init_dma(){
XAxiDma_Config *CfgPtr;
int status;
CfgPtr = XAxiDma_LookupConfig( (XPAR_AXI_DMA_0_DEVICE_ID) );
if(!CfgPtr){
print("Error looking for AXI DMA config\n\r");
return XST_FAILURE;
}
status = XAxiDma_CfgInitialize(&AxiDma,CfgPtr);
if(status != XST_SUCCESS){
print("Error initializing DMA\n\r");
return XST_FAILURE;
}
//check for scatter gather mode
if(XAxiDma_HasSg(&AxiDma)){
print("Error DMA configured in SG mode\n\r");
return XST_FAILURE;
}
/* Disable interrupts, we use polling mode */
XAxiDma_IntrDisable(&AxiDma, XAXIDMA_IRQ_ALL_MASK, XAXIDMA_DEVICE_TO_DMA);
XAxiDma_IntrDisable(&AxiDma, XAXIDMA_IRQ_ALL_MASK, XAXIDMA_DMA_TO_DEVICE);
// Reset DMA
XAxiDma_Reset(&AxiDma);
while (!XAxiDma_ResetIsDone(&AxiDma)) {}
return XST_SUCCESS;
}
// Application entry point: initializes the platform, DMA, and timer, then
// times NUM_TESTS software multiplies against NUM_TESTS DMA+HW multiplies
// and prints the speedup. Returns 0 (result mismatch is only printed).
int main(int argc, char **argv)
{
// NOTE(review): k and num_of_trials are only used by the commented-out
// trials loop below.
int i, j, k;
int err=0;
int status;
int num_of_trials = 1;
float A[DIM][DIM];
float B[DIM][DIM];
float res_hw[DIM][DIM];
float res_sw[DIM][DIM];
// Byte size of one DIM x DIM float matrix, as needed by the DMA.
unsigned int dma_size = SIZE * sizeof(float);
float acc_factor;
unsigned int init_time, curr_time, calibration;
unsigned int begin_time;
unsigned int end_time;
unsigned int run_time_sw = 0;
unsigned int run_time_hw = 0;
init_platform();
if (argc >= 2)
{
num_of_trials = atoi(argv[1]);
}
xil_printf("***************************************************************\n");
xil_printf(" FP 32x32 MATRIX MULT -> AXI DMA -> ARM ACP \n");
xil_printf(" XAPP1170 redesigned with Vivado + HLS + IP Integrator 2017.4 \n");
xil_printf("***************************************************************\n");
/* ******************************************************************************* */
// Init DMA
status = init_dma();
if(status != XST_SUCCESS){
print("\rError: DMA init failed\n");
return XST_FAILURE;
}
print("\nDMA Init done\n");
/* ******************************************************************************* */
// Setup HW timer
status = XTmrCtr_Initialize(&timer_dev, XPAR_AXI_TIMER_DEVICE_ID);
if(status != XST_SUCCESS)
{
print("Error: timer setup failed\n");
return XST_FAILURE;
}
XTmrCtr_SetOptions(&timer_dev, XPAR_AXI_TIMER_DEVICE_ID, XTC_ENABLE_ALL_OPTION);
// Calibrate HW timer: measure the cost of two back-to-back reads so it can
// be subtracted from every subsequent measurement.
XTmrCtr_Reset(&timer_dev, XPAR_AXI_TIMER_DEVICE_ID);
init_time = XTmrCtr_GetValue(&timer_dev, XPAR_AXI_TIMER_DEVICE_ID);
curr_time = XTmrCtr_GetValue(&timer_dev, XPAR_AXI_TIMER_DEVICE_ID);
calibration = curr_time - init_time;
// Loop measurement: time an intentionally empty loop (note the trailing
// semicolon) to gauge raw loop overhead.
XTmrCtr_Reset(&timer_dev, XPAR_AXI_TIMER_DEVICE_ID);
begin_time = XTmrCtr_GetValue(&timer_dev, XPAR_AXI_TIMER_DEVICE_ID);
for (i = 0; i< NUM_TESTS; i++);
end_time = XTmrCtr_GetValue(&timer_dev, XPAR_AXI_TIMER_DEVICE_ID);
run_time_sw = end_time - begin_time - calibration;
xil_printf("Loop time for %d iterations is %d cycles\n", NUM_TESTS, run_time_sw);
/* ******************************************************************************* */
// input data Initiation: A[i][j] = i+j, B[i][j] = i*j
for(i = 0; i<DIM; i++)
for(j = 0; j<DIM; j++)
{
A[i][j] = (float)(i+j);
B[i][j] = (float)(i*j);
}
/** End of Initiation */
//for (k=0; k<num_of_trials; k++)
{
/* ******************************************************************************* */
// call the software version of the function
xil_printf("Running Matrix Mult in SW\n");
XTmrCtr_Reset(&timer_dev, XPAR_AXI_TIMER_DEVICE_ID);
begin_time = XTmrCtr_GetValue(&timer_dev, XPAR_AXI_TIMER_DEVICE_ID);
for (i = 0; i < NUM_TESTS; i++)
{
matrix_multiply_ref(A, B, res_sw);
}
end_time = XTmrCtr_GetValue(&timer_dev, XPAR_AXI_TIMER_DEVICE_ID);
run_time_sw = end_time - begin_time - calibration;
xil_printf("\nTotal run time for SW on Processor is %d cycles over %d tests.\n",
run_time_sw/NUM_TESTS, NUM_TESTS);
/* ******************************************************************************* */
// call the HW accelerator; the timing below includes per-iteration driver
// setup done inside Start_HW_Accelerator
XTmrCtr_Reset(&timer_dev, XPAR_AXI_TIMER_DEVICE_ID);
begin_time = XTmrCtr_GetValue(&timer_dev, XPAR_AXI_TIMER_DEVICE_ID);
// Setup the HW Accelerator (driver init, interrupt hookup, cache flush)
status = Setup_HW_Accelerator(A, B, res_hw, dma_size);
for (i = 0; i < NUM_TESTS; i++) {
status = Start_HW_Accelerator();
status = Run_HW_Accelerator(A, B, res_hw, dma_size);
}
end_time = XTmrCtr_GetValue(&timer_dev, XPAR_AXI_TIMER_DEVICE_ID);
run_time_hw = end_time - begin_time - calibration;
xil_printf(
"Total run time for AXI DMA + HW accelerator is %d cycles over %d tests\n",
run_time_hw/NUM_TESTS, NUM_TESTS);
/* ******************************************************************************* */
//Compare the results from sw and hw (exact float equality)
for (i = 0; i < DIM; i++)
for (j = 0; j < DIM; j++)
if (res_sw[i][j] != res_hw[i][j]) {
err = 1;
}
// HW vs. SW speedup factor, printed as fixed-point since xil_printf has
// no float support
acc_factor = (float) run_time_sw / (float) run_time_hw;
xil_printf("Acceleration factor: %d.%d \n\n",
(int) acc_factor, (int) (acc_factor * 1000) % 1000);
}
if (err == 0)
print("SW and HW results match!\n\n");
else
print("ERROR: results mismatch\n\n");
cleanup_platform();
return 0;
}
// lib_xmmult_hw.c — host-side driver glue for the HLS matrix-mult core.
#include "platform.h"
#include "xparameters.h"
#include "xscugic.h"
#include "xaxidma.h"
//#include "xmmult_accel_core.h"
#include "xhls_accel.h"
#include "lib_xmmult_hw.h"
#include "xil_printf.h"
// Flags shared with the ISR (hence volatile).
volatile static int RunExample = 0;    // nonzero: ISR restarts the core after completion
volatile static int ResultExample = 0; // set by the ISR when the core signals done
// HLS accelerator instance and its static config (base address from xparameters.h).
XHls_accel xmmult_dev;
XHls_accel_Config xmmult_config = {
0,
XPAR_HLS_ACCEL_0_S_AXI_CONTROL_BUS_BASEADDR
};
//Interrupt Controller Instance
XScuGic ScuGic;
// AXI DMA Instance (defined in main.c)
extern XAxiDma AxiDma;
// Initialize the HLS accelerator driver with the static config above.
// Returns the XHls_accel_CfgInitialize status (XST_SUCCESS on success).
int XMmultSetup(){
return XHls_accel_CfgInitialize(&xmmult_dev,&xmmult_config);
}
// Enable the accelerator's interrupts and start one run of the core.
void XMmultStart(void *InstancePtr){
XHls_accel *pExample = (XHls_accel *)InstancePtr;
XHls_accel_InterruptEnable(pExample,1); // enable interrupt source bit 0
XHls_accel_InterruptGlobalEnable(pExample);
XHls_accel_Start(pExample);
}
// Interrupt service routine for the accelerator: acknowledges the interrupt,
// flags completion, and optionally restarts the core.
void XMmultIsr(void *InstancePtr){
XHls_accel *pExample = (XHls_accel *)InstancePtr;
//Disable the global interrupt
XHls_accel_InterruptGlobalDisable(pExample);
//Disable the local interrupt
XHls_accel_InterruptDisable(pExample,0xffffffff);
// clear the local interrupt
XHls_accel_InterruptClear(pExample,1);
// Signal completion to the foreground code.
ResultExample = 1;
// restart the core if it should run again
if(RunExample){
XMmultStart(pExample);
}
}
// Configure the ARM GIC and route the accelerator's fabric interrupt to
// XMmultIsr. Returns XST_SUCCESS or the first failing driver status.
int XMmultSetupInterrupt()
{
//This functions sets up the interrupt on the ARM
int result;
XScuGic_Config *pCfg = XScuGic_LookupConfig(XPAR_SCUGIC_SINGLE_DEVICE_ID);
if (pCfg == NULL){
print("Interrupt Configuration Lookup Failed\n\r");
return XST_FAILURE;
}
result = XScuGic_CfgInitialize(&ScuGic,pCfg,pCfg->CpuBaseAddress);
if(result != XST_SUCCESS){
return result;
}
// self test
result = XScuGic_SelfTest(&ScuGic);
if(result != XST_SUCCESS){
return result;
}
// Initialize the exception handler
Xil_ExceptionInit();
// Register the GIC's handler for all IRQ exceptions
//print("Register the exception handler\n\r");
Xil_ExceptionRegisterHandler(XIL_EXCEPTION_ID_INT,(Xil_ExceptionHandler)XScuGic_InterruptHandler,&ScuGic);
//Enable the exception handler
Xil_ExceptionEnable();
// Connect the accelerator ISR to the fabric interrupt line
//print("Connect the Adder ISR to the Exception handler table\n\r");
result = XScuGic_Connect(&ScuGic,XPAR_FABRIC_HLS_ACCEL_0_INTERRUPT_INTR,(Xil_InterruptHandler)XMmultIsr,&xmmult_dev);
if(result != XST_SUCCESS){
return result;
}
//print("Enable the Adder ISR\n\r");
XScuGic_Enable(&ScuGic,XPAR_FABRIC_HLS_ACCEL_0_INTERRUPT_INTR);
return XST_SUCCESS;
}
// One-time setup: initialize the HLS core driver, hook up its interrupt,
// start the core, then flush the operand/result buffers from the data cache
// so the DMA sees coherent memory. Returns 0 or XST_FAILURE.
int Setup_HW_Accelerator(float A[DIM][DIM], float B[DIM][DIM], float res_hw[DIM][DIM], int dma_size)
//Setup the Vivado HLS Block
{
int status = XMmultSetup();
if(status != XST_SUCCESS){
print("Error: example setup failed\n");
return XST_FAILURE;
}
status = XMmultSetupInterrupt();
if(status != XST_SUCCESS){
print("Error: interrupt setup failed\n");
return XST_FAILURE;
}
//XHls_accel_core_SetVal1(&xmmult_dev,val1);
//XHls_accel_core_SetVal2(&xmmult_dev,val2);
XMmultStart(&xmmult_dev);
//flush the cache
Xil_DCacheFlushRange((unsigned int)A,dma_size);
Xil_DCacheFlushRange((unsigned int)B,dma_size);
// NOTE(review): res_hw is only flushed here, before the DMA writes it.
// This presumes a cache-coherent path (ACP); on a non-coherent port an
// invalidate after the receive transfer would also be required — confirm
// against the hardware design.
Xil_DCacheFlushRange((unsigned int)res_hw,dma_size);
print("\rCache cleared\n\r");
return 0;
}
// Software reference matrix multiply used to check the accelerator output:
// out = a * b for DIM x DIM float matrices.
void matrix_multiply_ref(float a[DIM][DIM], float b[DIM][DIM], float out[DIM][DIM])
{
	int row, col, k;
	// Row-by-column inner products.
	for (row = 0; row < DIM; ++row)
		for (col = 0; col < DIM; ++col)
		{
			float acc = 0;
			for (k = 0; k < DIM; ++k)
				acc += a[row][k] * b[k][col];
			out[row][col] = acc;
		}
}
// (Re)initialize the accelerator driver and interrupt, then start the core.
// NOTE(review): main() calls this once per timed iteration, so driver and
// GIC setup are repeated inside the measurement loop — the HW timing
// therefore includes this setup overhead.
int Start_HW_Accelerator(void)
{
int status = XMmultSetup();
if(status != XST_SUCCESS){
print("Error: example setup failed\n");
return XST_FAILURE;
}
status = XMmultSetupInterrupt();
if(status != XST_SUCCESS){
print("Error: interrupt setup failed\n");
return XST_FAILURE;
}
//XHls_accel_core_SetVal1(&xmmult_dev,val1);
//XHls_accel_core_SetVal2(&xmmult_dev,val2);
XMmultStart(&xmmult_dev);
return 0;
}
// DMA completion is detected by polling (no interrupts).
// Run one accelerated multiply: stream A then B to the HLS block (MM2S),
// then read the DIM x DIM result back (S2MM). dma_size is the byte count
// of one matrix. Returns 0 on success, XST_FAILURE if a transfer fails
// to start.
int Run_HW_Accelerator(float A[DIM][DIM], float B[DIM][DIM], float res_hw[DIM][DIM], int dma_size)
{
	//transfer A to the Vivado HLS block
	int status = XAxiDma_SimpleTransfer(&AxiDma, (unsigned int) A, dma_size, XAXIDMA_DMA_TO_DEVICE);
	if (status != XST_SUCCESS) {
		//print("Error: DMA transfer to Vivado HLS block failed\n");
		return XST_FAILURE;
	}
	/* Wait for transfer to be done */
	while (XAxiDma_Busy(&AxiDma, XAXIDMA_DMA_TO_DEVICE)) ;
	//transfer B to the Vivado HLS block
	status = XAxiDma_SimpleTransfer(&AxiDma, (unsigned int) B, dma_size, XAXIDMA_DMA_TO_DEVICE);
	if (status != XST_SUCCESS) {
		//print("Error: DMA transfer to Vivado HLS block failed\n");
		return XST_FAILURE;
	}
	/* Wait for transfer to be done */
	while (XAxiDma_Busy(&AxiDma, XAXIDMA_DMA_TO_DEVICE)) ;
	//get results from the Vivado HLS block
	status = XAxiDma_SimpleTransfer(&AxiDma, (unsigned int) res_hw, dma_size,
			XAXIDMA_DEVICE_TO_DMA);
	if (status != XST_SUCCESS) {
		return XST_FAILURE;
	}
	/* Fix: the original first polled XAXIDMA_DMA_TO_DEVICE (the send
	 * channel) here even though the outstanding transfer is the receive
	 * (S2MM) one. A single wait until both channels are idle is correct
	 * and sufficient. */
	while ((XAxiDma_Busy(&AxiDma, XAXIDMA_DEVICE_TO_DMA)) || (XAxiDma_Busy(&AxiDma, XAXIDMA_DMA_TO_DEVICE))) ;
	return 0;
}
// lib_xmmult_hw.h — public interface of the accelerator helper library.
#ifndef H_LIB_EXAMPLE_HW_H
#define H_LIB_EXAMPLE_HW_H
// Matrix dimension and element count shared between main.c and the library.
#define DIM 32
#define SIZE ((DIM)*(DIM))
// One-time init: driver + interrupt setup, core start, cache flush.
int Setup_HW_Accelerator(float A[DIM][DIM], float B[DIM][DIM], float res_hw[DIM][DIM], int dma_size);
// Perform one DMA-driven multiply; res_hw receives the result.
int Run_HW_Accelerator(float A[DIM][DIM], float B[DIM][DIM], float res_hw[DIM][DIM], int dma_size);
// Re-init the driver/interrupt and start the core for the next run.
int Start_HW_Accelerator(void);
// Software reference multiply for result checking.
void matrix_multiply_ref(float a[DIM][DIM], float b[DIM][DIM], float out[DIM][DIM]);
#endif
最终实现的结果如下所示。
参考资料:Xilinx官方文档—A Zynq Accelerator for Floating Point Matrix Multiplication Designed with Vivado HLS.