1、由于Xilinx软件自身的问题,2022年以后版本号中的时间字段会溢出,导致VHLS不能生成IP核。之前的解决办法是修改系统时间,现在Xilinx已经发布了补丁:只要把补丁文件“automg_patch_20220104.tcl”移动到“xxx/xxx/Vivado/版本号/common/scripts”目录(例如“D:\vivado\Vivado\2019.1\common\scripts”)下即可。下面是下载链接,不需要积分哦!
2、关于很多同学问我的矩阵求逆问题,我在这里回答一下,为什么硬件电路不适合采用分解法求逆
由于采用分解法,不可避免地需要进行递推运算。比如进行LU分解时,LU矩阵内每个元素对应的运算公式均不一致,而且最后求逆用的还是高斯法,仍然不能避免递推运算。由于Verilog、VHDL代码映射的是电路,不能像C/C++那样动态分配内存(即动态改变电路结构),因此即便把3、4、5维的分解代码写出来,如果给你一个10阶的矩阵呢?完全不能人工直接书写Verilog、VHDL代码。因此有两种解决办法:①采用我的博客提出的公式高效堆叠法(并行速度极快,但是消耗资源多);②采用Vivado HLS(速度慢,对于小维度的矩阵没有质的提升,代码可读性差,但消耗资源少,方便书写)。
3、本人采用Cholesky分解法对正定矩阵进行了求逆实验,可以观察到分解结果、求逆结果以及二者相乘的结果,如有需要请联系作者。
4、采用VHLS工具,开展求解10维度的矩阵求逆实验
实验平台:ZCU106、Vivado 2019.1
实验过程:
①建立matrix_inv工程
②编写求逆inv.cpp文件以及testbench的main.cpp文件
③按顺序点击这四个按钮
第一个按钮是,C代码测试
第二个按钮是,综合C代码
第三个按钮是,联合仿真,可以输出波形
第四个按钮,输出IP核心。输出失败请参考第一部分。
上面就是IP的接口描述了。我们在Vivado里直接调用就行。
④建立Vivado工程,将IP核导入到工程,然后编写Main.v将结果存到RAM中。
上图为工程目录,下面给出Main.v模块的代码,底层模块如有需要请联系作者。
`timescale 1ns / 1ps
//
// Company: 东北电力大学
// Engineer: Yang Zheng
//
// Create Date: 2022/05/01 15:49:28
// Design Name:
// Module Name: Main
// Project Name:
// Target Devices:
// Tool Versions:
// Description:
//
// Dependencies:
//
// Revision:
// Revision 0.01 - File Created
// Additional Comments:
//
//
// Main: top-level wrapper around the HLS-generated core invNxN_0.
//
// It derives the working clock from a differential clock input through
// clk_wiz_0, holds the design in reset until the clock wizard reports lock,
// and wires the core's x_* port to mem_x (read-only from the core's side) and
// its y_* port to mem_y (read and write).
//
// Ports:
//   clk_125_p / clk_125_n : differential clock input fed to clk_wiz_0.
//   start                 : level signal wired directly to the core's ap_start.
// Parameter:
//   AUTOTB_TRANSACTION_NUM : upper bound for the ready_cnt / done_cnt counters.
//
// NOTE(review): `start` reaches ap_start without a synchronizer. If it is
// driven from a button or another clock domain on real hardware, a two-flop
// synchronizer in the clk_100m_i domain is probably wanted - confirm.
module Main(
    input clk_125_p,
    input clk_125_n,
    input start
);
    parameter AUTOTB_TRANSACTION_NUM = 1;

    // Clock-wizard nets. clk_10m_i and dcm_locked are declared explicitly so
    // the module does not rely on implicit net creation.
    wire clk_100m_i;
    wire clk_10m_i;     // second wizard output, not used inside this module
    wire dcm_locked;

    reg  ce = 1'b1;
    wire AESL_start;
    wire AESL_reset;
    wire AESL_ce;
    wire AESL_ready;
    wire AESL_idle;
    wire AESL_done;
    reg  AESL_done_delay  = 0;
    reg  AESL_done_delay2 = 0;
    reg  AESL_ready_delay = 0;
    wire ready;
    wire ready_wire;

    // Block-level handshake of the HLS core.
    wire ap_start;
    wire ap_done;
    wire ap_idle;
    wire ap_ready;

    // Memory interface of the HLS core: x is read, y is read and written.
    wire [6 : 0]  x_address0;
    wire          x_ce0;
    wire [31 : 0] x_q0;
    wire [6 : 0]  y_address0;
    wire          y_ce0;
    wire          y_we0;
    wire [31 : 0] y_d0;
    wire [31 : 0] y_q0;

    integer done_cnt  = 0;
    integer ready_cnt = 0;
    reg  interface_done = 0;
    wire ap_clk;
    wire ap_rst;
    wire ap_rst_n;

    // Clock generation
    clk_wiz_0 uut_clk_wiz_0(
        .clk_out1(clk_100m_i),
        .clk_out2(clk_10m_i),
        .reset(1'b0),
        .locked(dcm_locked),    // goes to 1 once the output clocks are stable
        .clk_in1_p(clk_125_p),
        .clk_in1_n(clk_125_n)
    );

    // HLS-generated matrix-inverse core
    invNxN_0 uut_invNxN_0(
        .x_ce0(x_ce0),              // output 1
        .y_ce0(y_ce0),              // output 1
        .y_we0(y_we0),              // output 1
        .ap_clk(ap_clk),            // input  1
        .ap_rst(ap_rst),            // input  1, active-high reset
        .ap_start(ap_start),        // input  1, starts the computation
        .ap_done(ap_done),          // output 1, set to 1 when the computation finishes
        .ap_idle(ap_idle),          // output 1, idle flag; drops as soon as computation starts
        .ap_ready(ap_ready),        // output 1
        .x_address0(x_address0),    // output 7; presumably 0..99 (100 entries of the 10x10 matrix) - confirm
        .x_q0(x_q0),                // input  32
        .y_address0(y_address0),    // output 7
        .y_d0(y_d0),                // output 32
        .y_q0(y_q0)                 // input  32
    );

    // Assignment for control signal
    assign ap_clk     = clk_100m_i;
    assign ap_rst     = AESL_reset;
    assign ap_rst_n   = ~AESL_reset;
    assign AESL_reset = ~dcm_locked;    // stay in reset until the clock wizard locks
    assign ap_start   = AESL_start;
    assign AESL_start = start;
    assign AESL_done  = ap_done;
    assign AESL_idle  = ap_idle;
    assign AESL_ready = ap_ready;
    assign AESL_ce    = ce;

    //------------------------arrayx Instantiation--------------
    // The input and output of arrayx
    wire arrayx_ce0, arrayx_ce1;
    wire arrayx_we0, arrayx_we1;
    wire [6 : 0]  arrayx_address0, arrayx_address1;
    wire [31 : 0] arrayx_din0, arrayx_din1;
    wire [31 : 0] arrayx_dout0, arrayx_dout1;
    wire arrayx_ready;
    wire arrayx_done;

    mem_x uut_MEM_x(
        .clk        (clk_100m_i),
        .rst        (AESL_reset),
        .ce0        (arrayx_ce0),
        .we0        (arrayx_we0),
        .address0   (arrayx_address0),
        .din0       (arrayx_din0),
        .dout0      (arrayx_dout0),
        .ce1        (arrayx_ce1),
        .we1        (arrayx_we1),
        .address1   (arrayx_address1),
        .din1       (arrayx_din1),
        .dout1      (arrayx_dout1),
        .ready      (arrayx_ready),
        .done       (arrayx_done)
    );

    // Assignment between dut and arrayx
    assign arrayx_address0 = x_address0;
    assign arrayx_ce0      = x_ce0;
    assign x_q0            = arrayx_dout0;
    assign arrayx_we0      = 0;
    assign arrayx_din0     = 0;
    // Port 1 is unused: tie off every input so nothing floats into the memory.
    assign arrayx_ce1      = 1'b0;
    assign arrayx_address1 = 7'd0;
    assign arrayx_we1      = 0;
    assign arrayx_din1     = 0;
    assign arrayx_ready    = ready;
    assign arrayx_done     = 0;

    //------------------------arrayy Instantiation--------------
    // The input and output of arrayy
    wire arrayy_ce0, arrayy_ce1;
    wire arrayy_we0, arrayy_we1;
    wire [6 : 0]  arrayy_address0, arrayy_address1;
    wire [31 : 0] arrayy_din0, arrayy_din1;
    wire [31 : 0] arrayy_dout0, arrayy_dout1;
    wire arrayy_ready;
    wire arrayy_done;

    mem_y uut_MEM_y(
        .clk        (clk_100m_i),
        .rst        (AESL_reset),
        .ce0        (arrayy_ce0),
        .we0        (arrayy_we0),
        .address0   (arrayy_address0),
        .din0       (arrayy_din0),
        .dout0      (arrayy_dout0),
        .ce1        (arrayy_ce1),
        .we1        (arrayy_we1),
        .address1   (arrayy_address1),
        .din1       (arrayy_din1),
        .dout1      (arrayy_dout1),
        .ready      (arrayy_ready),
        .done       (arrayy_done)
    );

    // Assignment between dut and arrayy
    assign arrayy_address0 = y_address0;
    assign arrayy_ce0      = y_ce0;
    assign y_q0            = arrayy_dout0;
    assign arrayy_we0      = y_we0;
    assign arrayy_din0     = y_d0;
    // Port 1 is unused: tie off every input so nothing floats into the memory.
    assign arrayy_ce1      = 1'b0;
    assign arrayy_address1 = 7'd0;
    assign arrayy_we1      = 0;
    assign arrayy_din1     = 0;
    assign arrayy_ready    = ready;
    assign arrayy_done     = interface_done;

    // ready_cnt: counts delayed-ready pulses, saturating at
    // AUTOTB_TRANSACTION_NUM. It is not cleared by reset; only its declaration
    // initializer sets it to 0.
    always @(posedge clk_100m_i) begin
        if (AESL_reset == 0) begin
            if (ready == 1) begin
                if (ready_cnt < AUTOTB_TRANSACTION_NUM) begin
                    // Non-blocking: interface_done reads ready_cnt on the same edge.
                    ready_cnt <= ready_cnt + 1;
                end
            end
        end
    end

    wire all_finish = (done_cnt == AUTOTB_TRANSACTION_NUM);

    // done_cnt: counts cycles with ap_done high, saturating at
    // AUTOTB_TRANSACTION_NUM.
    always @(posedge clk_100m_i) begin
        if (AESL_reset) begin
            done_cnt <= 0;
        end else begin
            if (AESL_done == 1) begin
                if (done_cnt < AUTOTB_TRANSACTION_NUM) begin
                    done_cnt <= done_cnt + 1;
                end
            end
        end
    end

    // start_cnt: counts cycles in which the core reports ap_ready.
    // This is the only block that uses an asynchronous reset.
    reg [31:0] start_cnt;
    always @(posedge clk_100m_i or posedge AESL_reset) begin
        if (AESL_reset) begin
            ce        <= 1;
            start_cnt <= 0;
        end
        else begin
            if (AESL_ready) begin
                start_cnt <= start_cnt + 1;
            end
        end
    end

    // One-cycle delayed copy of ap_ready. Non-blocking so that blocks reading
    // `ready` on the same clock edge always see the previous value.
    always @(posedge clk_100m_i) begin
        if (AESL_reset)
            AESL_ready_delay <= 0;
        else
            AESL_ready_delay <= AESL_ready;
    end
    assign ready      = AESL_ready_delay;
    assign ready_wire = AESL_ready_delay;

    // Two-stage delay line on ap_done.
    always @(posedge clk_100m_i) begin
        if (AESL_reset) begin
            AESL_done_delay  <= 0;
            AESL_done_delay2 <= 0;
        end
        else begin
            AESL_done_delay  <= AESL_done;
            AESL_done_delay2 <= AESL_done_delay;
        end
    end

    // interface_done (drives mem_y.done): asserted for a cycle either after an
    // intermediate ready pulse or after the delayed done of the last
    // transaction. Uses ==, not the simulation-only ===; an X operand still
    // takes the else path.
    always @(posedge clk_100m_i) begin
        if (AESL_reset)
            interface_done <= 0;
        else begin
            if (ready == 1 && ready_cnt > 0 && ready_cnt < AUTOTB_TRANSACTION_NUM)
                interface_done <= 1;
            else if (AESL_done_delay == 1 && done_cnt == AUTOTB_TRANSACTION_NUM)
                interface_done <= 1;
            else
                interface_done <= 0;
        end
    end

    // progress and performance: free-running cycle counter and one-cycle
    // delayed copies of ready/start. Nothing in this module reads them.
    reg [31:0] clk_cnt = 0;
    reg AESL_ready_p1;
    reg AESL_start_p1;
    always @(posedge clk_100m_i) begin
        clk_cnt       <= clk_cnt + 1;
        AESL_ready_p1 <= AESL_ready;
        AESL_start_p1 <= AESL_start;
    end
endmodule
下面给出testbench.v文件
`timescale 1ns / 1ps
//
// Company: 东北电力大学
// Engineer: Yang Zheng
//
// Create Date: 2022/05/02 14:44:50
// Design Name:
// Module Name: testbench
// Project Name:
// Target Devices:
// Tool Versions:
// Description:
//
// Dependencies:
//
// Revision:
// Revision 0.01 - File Created
// Additional Comments:
//
//
// testbench: simulation top for Main. Generates a free-running clock, feeds it
// to Main as a differential pair, and raises `start` once after 200 time units.
module testbench(
);
    // Half period of the stimulus clock, in timescale units.
    localparam HALF_PERIOD = 4;

    reg clk;
    reg start;

    // Clock generator: starts low and toggles every HALF_PERIOD.
    initial begin
        clk = 1'b0;
        forever #HALF_PERIOD clk = ~clk;
    end

    // Start stimulus: low at time 0, high from time 200 onwards.
    initial begin
        start = 1'b0;
        #200 start = 1'b1;
    end

    // Device under test; the negative clock leg is the inverted clock.
    Main uut_Main(
        .start      (start),
        .clk_125_n  (~clk),
        .clk_125_p  (clk)
    );
endmodule
实验结果:
计算时间大约为101.5us,比公式法慢了5倍,且只能计算正定矩阵,而公式法可以计算任意非奇异矩阵,但VHLS资源占用很少! 我们将红色部分放大:
y_q0代表了求逆结果, y_address0代表了结果对应的ram地址。下面分析结果
3efc6d2b=0.4930203855037689;
3f078ec4=0.529522180557251;
bd59f582=-0.05321265012025833;
bf0576ea=-0.5213457345962524;
be939de3=-0.28831395506858826;
由前五个结果对应的浮点数可知,求逆结果正确。