本文是一个利用DMA通信的例子,主要参考官方文档实现。软件的操作步骤可以参考前面文章: 链接
一、Overview
Zedboard开发板由PS(Processing System)和PL(Programmable Logic)两部分构成,PS部分由两个ARM CPU核构成,PL部分是FPGA可编程逻辑。本文通过DMA(Direct Memory Access)方式实现两部分的通信,对矩阵乘应用进行加速。系统架构图如下所示,左侧是PS端,右端是PL端。控制信号通过PS端的GP0接口与PL端的IP核交互(AXI-lite总线),读写数据通过PS端的ACP接口与PL端进行交互(AXI-Stream总线)。DMA和Timer是利用Vivado中固有IP核实现,HLS IP core是利用Vivado HLS编写的矩阵乘IP核。
基本参数
工具版本:2017.4
实现功能:矩阵乘(float类型)
传输方式:DMA
传输总线:AXI-stream
二、HLS
HLS部分的代码文件如下。参考上一篇文章的步骤,直接综合、生成IP核即可。// mmult_accel.cpp
#include <stdio.h>
#include <stdlib.h>
#include "mmult.h"
#define MCR_SIZE 1024
// Thin non-template wrapper around mmult_hw so the 32x32 float multiply can
// be called (and C-simulated) without template syntax. Computes C = A * B.
void standalone_mmult (float A[32][32], float B[32][32], float C[32][32])
{
mmult_hw <float, 32>(A, B, C);
}
// HLS top-level function for synthesis.
// INPUT_STREAM carries matrix A followed by matrix B (2*MCR_SIZE beats);
// OUTPUT_STREAM carries the 32x32 float product (MCR_SIZE beats).
void HLS_accel (AXI_VAL INPUT_STREAM[2*MCR_SIZE], AXI_VAL OUTPUT_STREAM[MCR_SIZE])
{
// Control (start/done) register file exposed over AXI4-Lite as CONTROL_BUS.
#pragma HLS INTERFACE s_axilite port=return bundle=CONTROL_BUS
// Data ports mapped to AXI4-Stream interfaces (driven by the AXI DMA).
#pragma HLS INTERFACE axis port=OUTPUT_STREAM
#pragma HLS INTERFACE axis port=INPUT_STREAM
wrapped_mmult_hw <float, 32, 32*32, 4, 5, 5>(INPUT_STREAM, OUTPUT_STREAM);
return;
}
// mmult.h — shared declarations for the HLS matrix-multiply accelerator.
#include <assert.h>
#include <ap_axi_sdata.h>
// NOTE(review): MCR_SIZE is also defined in mmult_accel.cpp — keep in sync.
#define MCR_SIZE 1024
typedef ap_axiu<32,4,5,5> AXI_VAL; // one AXI-Stream beat: 32-bit data plus 4/5/5-bit USER/ID/DEST side channels
// function prototypes
void standalone_mmult (float A[32][32], float B[32][32], float C[32][32]);
void HLS_accel (AXI_VAL in_stream[2*MCR_SIZE], AXI_VAL out_stream[MCR_SIZE]);
/* ****************************** C++ TEMPLATES ************************************** */
// Reference (software) matrix multiply: out = a * b for DIM x DIM matrices.
// Used by the testbench to validate the hardware result.
template <typename T, int DIM>
void matrix_multiply_ref(T a[DIM][DIM], T b[DIM][DIM], T out[DIM][DIM])
{
	// Classic triple loop over rows, columns, and the inner product index.
	for (int ia = 0; ia < DIM; ++ia)
		for (int ib = 0; ib < DIM; ++ib)
		{
			// Fix: accumulate in T (the original used a hard-coded float),
			// so the template is correct when instantiated with double.
			T sum = 0;
			for (int id = 0; id < DIM; ++id)
				sum += a[ia][id] * b[id][ib];
			out[ia][ib] = sum;
		}
	return;
}
// --------------------------------------------------------
// compute:function to be accelerated in HW
// Hardware matrix multiply: out = a * b, one result element per cycle.
template <typename T, int DIM>
void mmult_hw(T a[DIM][DIM], T b[DIM][DIM], T out[DIM][DIM])
{
// NOTE(review): FACTOR is unused — the array_partition pragmas below use
// the literal 16 instead; consider keeping them in sync.
int const FACTOR = DIM/2;
#pragma HLS INLINE
// Partition the operands so 16 elements of a row of 'a' and of a column of
// 'b' can be read in parallel, enabling the II=1 pipeline below.
#pragma HLS array_partition variable=a block factor=16 dim=2
#pragma HLS array_partition variable=b block factor=16 dim=1
// matrix multiplication of a A*B matrix
L1:for (int ia = 0; ia < DIM; ++ia)
L2:for (int ib = 0; ib < DIM; ++ib)
{
// Pipeline the element loop; HLS unrolls inner loop L3 under it.
#pragma HLS PIPELINE II=1
T sum = 0;
L3:for (int id = 0; id < DIM; ++id)
sum += a[ia][id] * b[id][ib];
out[ia][ib] = sum;
}
return;
}
// --------------------------------------------------------
// functions to insert and extract elements from an axi stream
// includes conversion to correct data type
// input from axi
// Pop one beat from the AXI-Stream and reinterpret its 32-bit payload as T.
template <typename T, int U, int TI, int TD>
T pop_stream(ap_axiu <sizeof(T)*8,U,TI,TD> const &e)
{
#pragma HLS INLINE
assert(sizeof(T) == sizeof(int));
union // type-pun via union: reinterpret the 32-bit int payload as a 32-bit T (e.g. float)
{
int ival;
T oval;
} converter;
converter.ival = e.data;
T ret = converter.oval;
// These values are unused; the reads into volatiles only ensure the
// side-channel fields of the struct are consumed.
volatile ap_uint<sizeof(T)> strb = e.strb;
volatile ap_uint<sizeof(T)> keep = e.keep;
volatile ap_uint<U> user = e.user;
volatile ap_uint<1> last = e.last;
volatile ap_uint<TI> id = e.id;
volatile ap_uint<TD> dest = e.dest;
return ret;
}
// Output to axis
// Pack one value of T into an AXI-Stream beat; 'last' drives TLAST.
template <typename T, int U, int TI, int TD>
ap_axiu <sizeof(T)*8,U,TI,TD> push_stream(T const &v, bool last = false)
{
#pragma HLS INLINE
ap_axiu<sizeof(T)*8,U,TI,TD> e;
assert(sizeof(T) == sizeof(int));
union // type-pun: reinterpret the 32-bit T as the int stream payload
{
int oval;
T ival;
} converter;
converter.ival = v;
e.data = converter.oval;
// set it to sizeof(T) ones
e.strb = -1;
e.keep = 15; //e.strb;
e.user = 0;
e.last = last ? 1 : 0;
e.id = 0;
e.dest = 0;
return e;
}
// --------------------------------------------------------------------
// function to be accelerated in HW wrapped with AXI4-Stream interface
// top function for accelerator
// Streaming wrapper: unpacks A (first SIZE beats) and B (next SIZE beats)
// from in_stream, multiplies them, and packs the product into out_stream.
template <typename T, int DIM, int SIZE, int U, int TI, int TD>
void wrapped_mmult_hw (
AXI_VAL in_stream[2*SIZE],
AXI_VAL out_stream[SIZE])
{
#pragma HLS INLINE
T a[DIM][DIM];
T b[DIM][DIM];
T out[DIM][DIM];
// The stream payload is 32 bits wide, so T must be a 32-bit type.
assert(sizeof(T)*8 == 32);
// input1:stream in first matrix
for(int i=0; i<DIM; i++)
for(int j=0; j<DIM; j++)
{
#pragma HLS PIPELINE II=1
int k = i*DIM+j;
a[i][j] = pop_stream<T,U,TI,TD>(in_stream[k]);
}
// input2:stream in second matrix (offset by SIZE beats in the input stream)
for(int i=0; i<DIM; i++)
for(int j=0; j<DIM; j++)
{
#pragma HLS PIPELINE II=1
int k = i*DIM+j+SIZE;
b[i][j] = pop_stream<T,U,TI,TD>(in_stream[k]);
}
// compute3:do HW multiplication
mmult_hw<T, DIM>(a,b,out);
// output4:stream out result matrix; TLAST is asserted on the final beat
for(int i=0; i<DIM; i++)
for(int j=0; j<DIM; j++)
{
#pragma HLS PIPELINE II=1
int k = i*DIM+j;
out_stream[k] = push_stream<T,U,TI,TD>(out[i][j],k == (SIZE-1));
}
return;
}
// Self-checking testbench: streams two DIM x DIM matrices through the
// wrapped accelerator and compares the result against the software
// reference. Returns 0 on success, nonzero on the first mismatch.
template <typename T, int DIM, int SIZE, int U, int TI, int TD>
int test_matrix_mult(void)
{
	T matOp1[DIM][DIM];
	T matOp2[DIM][DIM];
	T matMult_sw[DIM][DIM];
	T matMult_hw[DIM][DIM];

	// Deterministic operand patterns: op1[r][c] = r+c, op2[r][c] = r*c.
	for (int r = 0; r < DIM; r++)
		for (int c = 0; c < DIM; c++)
		{
			matOp1[r][c] = (float)(r + c);
			matOp2[r][c] = (float)(r * c);
		}

	printf("DEBUGGING AXI4 STREAMING DATA TYPES!\r\n");

	// Build the input stream: matrix A occupies the first SIZE beats,
	// matrix B the next SIZE. TLAST is raised on the very last beat.
	AXI_VAL inp_stream[2*SIZE];
	AXI_VAL out_stream[SIZE];
	assert(sizeof(T)*8 == 32);

	for (int r = 0; r < DIM; r++)
		for (int c = 0; c < DIM; c++)
		{
			int idx = r * DIM + c;
			inp_stream[idx] = push_stream<T,U,TI,TD>(matOp1[r][c], 0);
			inp_stream[idx + SIZE] = push_stream<T,U,TI,TD>(matOp2[r][c], idx == (SIZE-1));
		}

	// Run the DUT, then unpack its output stream into matMult_hw.
	wrapped_mmult_hw<T, DIM, SIZE, U, TI, TD>(inp_stream, out_stream);
	for (int r = 0; r < DIM; r++)
		for (int c = 0; c < DIM; c++)
			matMult_hw[r][c] = pop_stream<T,U,TI,TD>(out_stream[r * DIM + c]);

	// Golden result from the software reference implementation.
	matrix_multiply_ref<T, DIM>(matOp1, matOp2, matMult_sw);

	// Element-wise comparison; bail out at the first mismatch.
	int err = 0;
	for (int r = 0; (r < DIM && !err); r++)
		for (int c = 0; (c < DIM && !err); c++)
			if (matMult_sw[r][c] != matMult_hw[r][c])
				err++;

	if (err == 0)
		printf("Matrixes identical ... Test successful!\r\n");
	else
		printf("Test failed!\r\n");
	return err;
}
#include <stdio.h>
#include <stdlib.h>
#include "mmult.h"
/*
 * Testbench entry points:
 *  - main (normal build): compares mmult_sw against standalone_mmult to
 *    verify the hardware function is correct.
 *  - main (DB_DEBUG build): uses the header templates to generate data and
 *    simulate the full pop -> compute -> push streaming flow.
 *  - HLS_accel: the top function with stream ports; this is what is
 *    synthesized into the final hardware.
 */
typedef float T;
int const DIM = 32;
int const SIZE = DIM*DIM;
// Plain software matrix multiply (reference implementation): out = a * b.
void mmult_sw(T a[DIM][DIM], T b[DIM][DIM], T out[DIM][DIM])
{
	// Row-by-column inner products over the DIM x DIM operands.
	for (int row = 0; row < DIM; ++row)
		for (int col = 0; col < DIM; ++col)
		{
			float acc = 0;
			for (int k = 0; k < DIM; ++k)
				acc += a[row][k] * b[k][col];
			out[row][col] = acc;
		}
}
// 1. Debug build: run the full streaming testbench from mmult.h
#ifdef DB_DEBUG
int main(void)
{
int ret_val = 0;
// Generates data internally and simulates the pop -> compute -> push flow
// with AXI side-channel widths USER=4, ID=5, DEST=5.
ret_val = test_matrix_mult<T, DIM, SIZE, 4,5,5>();
return ret_val;
}
#else
// 2. Normal build: compare the hardware function against the SW reference
int main(void)
{
int ret_val = 0; // NOTE(review): unused — the function returns err below
int i,j, err;
T matOp1[DIM][DIM];
T matOp2[DIM][DIM];
T matMult_sw[DIM][DIM];
T matMult_hw[DIM][DIM];
/** Matrix Initiation: op1[i][j] = i+j, op2[i][j] = i*j */
for(i = 0; i<DIM; i++)
for(j = 0; j<DIM; j++)
matOp1[i][j] = (float)(i+j);
for(i = 0; i<DIM; i++)
for(j = 0; j<DIM; j++)
matOp2[i][j] = (float)(i*j);
/** End of Initiation */
printf("NORMAL MODE\r\n");
// Hardware-equivalent function under test.
standalone_mmult(matOp1, matOp2, matMult_hw);
/* reference Matrix Multiplication */
mmult_sw(matOp1, matOp2, matMult_sw);
/** Matrix comparison — stop at the first mismatch */
err = 0;
for (i = 0; (i<DIM && !err); i++)
for (j = 0; (j<DIM && !err); j++)
if (matMult_sw[i][j] != matMult_hw[i][j])
err++;
if (err == 0)
printf("Matrixes identical ... Test successful!\r\n");
else
printf("Test failed!\r\n");
return err;
}
#endif
三、Vivado
在Vivado中按照前述系统架构搭建Block Design(系统框图),各IP核的地址由工具自动分配。
四、SDK
// main.c — bare-metal host application: times SW vs. DMA+HW matrix multiply.
#include <stdio.h>
#include <stdlib.h>
#include "platform.h"
#include "xparameters.h"
#include "xtmrctr.h"
#include "xaxidma.h"
#include "lib_xmmult_hw.h"
// Number of timed iterations for both the SW and HW measurements.
#define NUM_TESTS 1024
#define XPAR_AXI_TIMER_DEVICE_ID (XPAR_AXI_TIMER_0_DEVICE_ID)
// TIMER Instance (AXI timer in the PL, used for cycle counting)
XTmrCtr timer_dev;
// AXI DMA Instance (shared with lib_xmmult_hw.c via extern)
XAxiDma AxiDma;
int init_dma(){
XAxiDma_Config *CfgPtr;
int status;
CfgPtr = XAxiDma_LookupConfig( (XPAR_AXI_DMA_0_DEVICE_ID) );
if(!CfgPtr){
print("Error looking for AXI DMA config\n\r");
return XST_FAILURE;
}
status = XAxiDma_CfgInitialize(&AxiDma,CfgPtr);
if(status != XST_SUCCESS){
print("Error initializing DMA\n\r");
return XST_FAILURE;
}
//check for scatter gather mode
if(XAxiDma_HasSg(&AxiDma)){
print("Error DMA configured in SG mode\n\r");
return XST_FAILURE;
}
/* Disable interrupts, we use polling mode */
XAxiDma_IntrDisable(&AxiDma, XAXIDMA_IRQ_ALL_MASK, XAXIDMA_DEVICE_TO_DMA);
XAxiDma_IntrDisable(&AxiDma, XAXIDMA_IRQ_ALL_MASK, XAXIDMA_DMA_TO_DEVICE);
// Reset DMA
XAxiDma_Reset(&AxiDma);
while (!XAxiDma_ResetIsDone(&AxiDma)) {}
return XST_SUCCESS;
}
// Application entry point: initializes the platform, DMA, and timer, then
// times NUM_TESTS software multiplies against NUM_TESTS DMA+HW multiplies
// and prints the speedup. Returns 0 (result mismatch is only printed).
int main(int argc, char **argv)
{
// NOTE(review): k and num_of_trials are only used by the commented-out
// trials loop below.
int i, j, k;
int err=0;
int status;
int num_of_trials = 1;
float A[DIM][DIM];
float B[DIM][DIM];
float res_hw[DIM][DIM];
float res_sw[DIM][DIM];
// Byte size of one DIM x DIM float matrix, as needed by the DMA.
unsigned int dma_size = SIZE * sizeof(float);
float acc_factor;
unsigned int init_time, curr_time, calibration;
unsigned int begin_time;
unsigned int end_time;
unsigned int run_time_sw = 0;
unsigned int run_time_hw = 0;
init_platform();
if (argc >= 2)
{
num_of_trials = atoi(argv[1]);
}
xil_printf("***************************************************************\n");
xil_printf(" FP 32x32 MATRIX MULT -> AXI DMA -> ARM ACP \n");
xil_printf(" XAPP1170 redesigned with Vivado + HLS + IP Integrator 2017.4 \n");
xil_printf("***************************************************************\n");
/* ******************************************************************************* */
// Init DMA
status = init_dma();
if(status != XST_SUCCESS){
print("\rError: DMA init failed\n");
return XST_FAILURE;
}
print("\nDMA Init done\n");
/* ******************************************************************************* */
// Setup HW timer
status = XTmrCtr_Initialize(&timer_dev, XPAR_AXI_TIMER_DEVICE_ID);
if(status != XST_SUCCESS)
{
print("Error: timer setup failed\n");
return XST_FAILURE;
}
XTmrCtr_SetOptions(&timer_dev, XPAR_AXI_TIMER_DEVICE_ID, XTC_ENABLE_ALL_OPTION);
// Calibrate HW timer: measure the cost of two back-to-back reads so it can
// be subtracted from every subsequent measurement.
XTmrCtr_Reset(&timer_dev, XPAR_AXI_TIMER_DEVICE_ID);
init_time = XTmrCtr_GetValue(&timer_dev, XPAR_AXI_TIMER_DEVICE_ID);
curr_time = XTmrCtr_GetValue(&timer_dev, XPAR_AXI_TIMER_DEVICE_ID);
calibration = curr_time - init_time;
// Loop measurement: time an intentionally empty loop (note the trailing
// semicolon) to gauge raw loop overhead.
XTmrCtr_Reset(&timer_dev, XPAR_AXI_TIMER_DEVICE_ID);
begin_time = XTmrCtr_GetValue(&timer_dev, XPAR_AXI_TIMER_DEVICE_ID);
for (i = 0; i< NUM_TESTS; i++);
end_time = XTmrCtr_GetValue(&timer_dev, XPAR_AXI_TIMER_DEVICE_ID);
run_time_sw = end_time - begin_time - calibration;
xil_printf("Loop time for %d iterations is %d cycles\n", NUM_TESTS, run_time_sw);
/* ******************************************************************************* */
// input data Initiation: A[i][j] = i+j, B[i][j] = i*j
for(i = 0; i<DIM; i++)
for(j = 0; j<DIM; j++)
{
A[i][j] = (float)(i+j);
B[i][j] = (float)(i*j);
}
/** End of Initiation */
//for (k=0; k<num_of_trials; k++)
{
/* ******************************************************************************* */
// call the software version of the function
xil_printf("Running Matrix Mult in SW\n");
XTmrCtr_Reset(&timer_dev, XPAR_AXI_TIMER_DEVICE_ID);
begin_time = XTmrCtr_GetValue(&timer_dev, XPAR_AXI_TIMER_DEVICE_ID);
for (i = 0; i < NUM_TESTS; i++)
{
matrix_multiply_ref(A, B, res_sw);
}
end_time = XTmrCtr_GetValue(&timer_dev, XPAR_AXI_TIMER_DEVICE_ID);
run_time_sw = end_time - begin_time - calibration;
xil_printf("\nTotal run time for SW on Processor is %d cycles over %d tests.\n",
run_time_sw/NUM_TESTS, NUM_TESTS);
/* ******************************************************************************* */
// call the HW accelerator; the timing below includes per-iteration driver
// setup done inside Start_HW_Accelerator
XTmrCtr_Reset(&timer_dev, XPAR_AXI_TIMER_DEVICE_ID);
begin_time = XTmrCtr_GetValue(&timer_dev, XPAR_AXI_TIMER_DEVICE_ID);
// Setup the HW Accelerator (driver init, interrupt hookup, cache flush)
status = Setup_HW_Accelerator(A, B, res_hw, dma_size);
for (i = 0; i < NUM_TESTS; i++) {
status = Start_HW_Accelerator();
status = Run_HW_Accelerator(A, B, res_hw, dma_size);
}
end_time = XTmrCtr_GetValue(&timer_dev, XPAR_AXI_TIMER_DEVICE_ID);
run_time_hw = end_time - begin_time - calibration;
xil_printf(
"Total run time for AXI DMA + HW accelerator is %d cycles over %d tests\n",
run_time_hw/NUM_TESTS, NUM_TESTS);
/* ******************************************************************************* */
//Compare the results from sw and hw (exact float equality)
for (i = 0; i < DIM; i++)
for (j = 0; j < DIM; j++)
if (res_sw[i][j] != res_hw[i][j]) {
err = 1;
}
// HW vs. SW speedup factor, printed as fixed-point since xil_printf has
// no float support
acc_factor = (float) run_time_sw / (float) run_time_hw;
xil_printf("Acceleration factor: %d.%d \n\n",
(int) acc_factor, (int) (acc_factor * 1000) % 1000);
}
if (err == 0)
print("SW and HW results match!\n\n");
else
print("ERROR: results mismatch\n\n");
cleanup_platform();
return 0;
}
// lib_xmmult_hw.c — host-side driver glue for the HLS matrix-mult core.
#include "platform.h"
#include "xparameters.h"
#include "xscugic.h"
#include "xaxidma.h"
//#include "xmmult_accel_core.h"
#include "xhls_accel.h"
#include "lib_xmmult_hw.h"
#include "xil_printf.h"
// Flags shared with the ISR (hence volatile).
volatile static int RunExample = 0;    // nonzero: ISR restarts the core after completion
volatile static int ResultExample = 0; // set by the ISR when the core signals done
// HLS accelerator instance and its static config (base address from xparameters.h).
XHls_accel xmmult_dev;
XHls_accel_Config xmmult_config = {
0,
XPAR_HLS_ACCEL_0_S_AXI_CONTROL_BUS_BASEADDR
};
//Interrupt Controller Instance
XScuGic ScuGic;
// AXI DMA Instance (defined in main.c)
extern XAxiDma AxiDma;
// Initialize the HLS accelerator driver with the static config above.
// Returns the XHls_accel_CfgInitialize status (XST_SUCCESS on success).
int XMmultSetup(){
return XHls_accel_CfgInitialize(&xmmult_dev,&xmmult_config);
}
// Enable the accelerator's interrupts and start one run of the core.
void XMmultStart(void *InstancePtr){
XHls_accel *pExample = (XHls_accel *)InstancePtr;
XHls_accel_InterruptEnable(pExample,1); // enable interrupt source bit 0
XHls_accel_InterruptGlobalEnable(pExample);
XHls_accel_Start(pExample);
}
// Interrupt service routine for the accelerator: acknowledges the interrupt,
// flags completion, and optionally restarts the core.
void XMmultIsr(void *InstancePtr){
XHls_accel *pExample = (XHls_accel *)InstancePtr;
//Disable the global interrupt
XHls_accel_InterruptGlobalDisable(pExample);
//Disable the local interrupt
XHls_accel_InterruptDisable(pExample,0xffffffff);
// clear the local interrupt
XHls_accel_InterruptClear(pExample,1);
// Signal completion to the foreground code.
ResultExample = 1;
// restart the core if it should run again
if(RunExample){
XMmultStart(pExample);
}
}
// Configure the ARM GIC and route the accelerator's fabric interrupt to
// XMmultIsr. Returns XST_SUCCESS or the first failing driver status.
int XMmultSetupInterrupt()
{
//This functions sets up the interrupt on the ARM
int result;
XScuGic_Config *pCfg = XScuGic_LookupConfig(XPAR_SCUGIC_SINGLE_DEVICE_ID);
if (pCfg == NULL){
print("Interrupt Configuration Lookup Failed\n\r");
return XST_FAILURE;
}
result = XScuGic_CfgInitialize(&ScuGic,pCfg,pCfg->CpuBaseAddress);
if(result != XST_SUCCESS){
return result;
}
// self test
result = XScuGic_SelfTest(&ScuGic);
if(result != XST_SUCCESS){
return result;
}
// Initialize the exception handler
Xil_ExceptionInit();
// Register the GIC's handler for all IRQ exceptions
//print("Register the exception handler\n\r");
Xil_ExceptionRegisterHandler(XIL_EXCEPTION_ID_INT,(Xil_ExceptionHandler)XScuGic_InterruptHandler,&ScuGic);
//Enable the exception handler
Xil_ExceptionEnable();
// Connect the accelerator ISR to the fabric interrupt line
//print("Connect the Adder ISR to the Exception handler table\n\r");
result = XScuGic_Connect(&ScuGic,XPAR_FABRIC_HLS_ACCEL_0_INTERRUPT_INTR,(Xil_InterruptHandler)XMmultIsr,&xmmult_dev);
if(result != XST_SUCCESS){
return result;
}
//print("Enable the Adder ISR\n\r");
XScuGic_Enable(&ScuGic,XPAR_FABRIC_HLS_ACCEL_0_INTERRUPT_INTR);
return XST_SUCCESS;
}
// One-time setup: initialize the HLS core driver, hook up its interrupt,
// start the core, then flush the operand/result buffers from the data cache
// so the DMA sees coherent memory. Returns 0 or XST_FAILURE.
int Setup_HW_Accelerator(float A[DIM][DIM], float B[DIM][DIM], float res_hw[DIM][DIM], int dma_size)
//Setup the Vivado HLS Block
{
int status = XMmultSetup();
if(status != XST_SUCCESS){
print("Error: example setup failed\n");
return XST_FAILURE;
}
status = XMmultSetupInterrupt();
if(status != XST_SUCCESS){
print("Error: interrupt setup failed\n");
return XST_FAILURE;
}
//XHls_accel_core_SetVal1(&xmmult_dev,val1);
//XHls_accel_core_SetVal2(&xmmult_dev,val2);
XMmultStart(&xmmult_dev);
//flush the cache
Xil_DCacheFlushRange((unsigned int)A,dma_size);
Xil_DCacheFlushRange((unsigned int)B,dma_size);
// NOTE(review): res_hw is only flushed here, before the DMA writes it.
// This presumes a cache-coherent path (ACP); on a non-coherent port an
// invalidate after the receive transfer would also be required — confirm
// against the hardware design.
Xil_DCacheFlushRange((unsigned int)res_hw,dma_size);
print("\rCache cleared\n\r");
return 0;
}
// Software reference matrix multiply used to check the accelerator output:
// out = a * b for DIM x DIM float matrices.
void matrix_multiply_ref(float a[DIM][DIM], float b[DIM][DIM], float out[DIM][DIM])
{
	int row, col, k;
	// Row-by-column inner products.
	for (row = 0; row < DIM; ++row)
		for (col = 0; col < DIM; ++col)
		{
			float acc = 0;
			for (k = 0; k < DIM; ++k)
				acc += a[row][k] * b[k][col];
			out[row][col] = acc;
		}
}
// (Re)initialize the accelerator driver and interrupt, then start the core.
// NOTE(review): main() calls this once per timed iteration, so driver and
// GIC setup are repeated inside the measurement loop — the HW timing
// therefore includes this setup overhead.
int Start_HW_Accelerator(void)
{
int status = XMmultSetup();
if(status != XST_SUCCESS){
print("Error: example setup failed\n");
return XST_FAILURE;
}
status = XMmultSetupInterrupt();
if(status != XST_SUCCESS){
print("Error: interrupt setup failed\n");
return XST_FAILURE;
}
//XHls_accel_core_SetVal1(&xmmult_dev,val1);
//XHls_accel_core_SetVal2(&xmmult_dev,val2);
XMmultStart(&xmmult_dev);
return 0;
}
// DMA completion is detected by polling (no interrupts).
// Run one accelerated multiply: stream A then B to the HLS block (MM2S),
// then read the DIM x DIM result back (S2MM). dma_size is the byte count
// of one matrix. Returns 0 on success, XST_FAILURE if a transfer fails
// to start.
int Run_HW_Accelerator(float A[DIM][DIM], float B[DIM][DIM], float res_hw[DIM][DIM], int dma_size)
{
	//transfer A to the Vivado HLS block
	int status = XAxiDma_SimpleTransfer(&AxiDma, (unsigned int) A, dma_size, XAXIDMA_DMA_TO_DEVICE);
	if (status != XST_SUCCESS) {
		//print("Error: DMA transfer to Vivado HLS block failed\n");
		return XST_FAILURE;
	}
	/* Wait for transfer to be done */
	while (XAxiDma_Busy(&AxiDma, XAXIDMA_DMA_TO_DEVICE)) ;
	//transfer B to the Vivado HLS block
	status = XAxiDma_SimpleTransfer(&AxiDma, (unsigned int) B, dma_size, XAXIDMA_DMA_TO_DEVICE);
	if (status != XST_SUCCESS) {
		//print("Error: DMA transfer to Vivado HLS block failed\n");
		return XST_FAILURE;
	}
	/* Wait for transfer to be done */
	while (XAxiDma_Busy(&AxiDma, XAXIDMA_DMA_TO_DEVICE)) ;
	//get results from the Vivado HLS block
	status = XAxiDma_SimpleTransfer(&AxiDma, (unsigned int) res_hw, dma_size,
			XAXIDMA_DEVICE_TO_DMA);
	if (status != XST_SUCCESS) {
		return XST_FAILURE;
	}
	/* Fix: the original first polled XAXIDMA_DMA_TO_DEVICE (the send
	 * channel) here even though the outstanding transfer is the receive
	 * (S2MM) one. A single wait until both channels are idle is correct
	 * and sufficient. */
	while ((XAxiDma_Busy(&AxiDma, XAXIDMA_DEVICE_TO_DMA)) || (XAxiDma_Busy(&AxiDma, XAXIDMA_DMA_TO_DEVICE))) ;
	return 0;
}
// lib_xmmult_hw.h — public interface of the accelerator helper library.
#ifndef H_LIB_EXAMPLE_HW_H
#define H_LIB_EXAMPLE_HW_H
// Matrix dimension and element count shared between main.c and the library.
#define DIM 32
#define SIZE ((DIM)*(DIM))
// One-time init: driver + interrupt setup, core start, cache flush.
int Setup_HW_Accelerator(float A[DIM][DIM], float B[DIM][DIM], float res_hw[DIM][DIM], int dma_size);
// Perform one DMA-driven multiply; res_hw receives the result.
int Run_HW_Accelerator(float A[DIM][DIM], float B[DIM][DIM], float res_hw[DIM][DIM], int dma_size);
// Re-init the driver/interrupt and start the core for the next run.
int Start_HW_Accelerator(void);
// Software reference multiply for result checking.
void matrix_multiply_ref(float a[DIM][DIM], float b[DIM][DIM], float out[DIM][DIM]);
#endif
最终实现的结果如下所示。
参考资料:Xilinx官方文档—A Zynq Accelerator for Floating Point Matrix Multiplication Designed with Vivado HLS.