前言
我是将CNN中的特征图(包括输入源图)和权重的卷积运算利用FPGA的逻辑资源实现,软件负责整体算法结构,包括算法的流程和对数据的操作,PS与PL之间的通信,对于不经常变并且频繁访问的变量(weight、bias、featuremap_size)、控制信号等通过AXI总线直接写入寄存器,因为PL可以直接取用,不会有延迟;对于图片这种较大的图像数据,通过DMA读写BRAM去加载特征图和读取卷积后的图像相比于寄存器方式更加高效;当PL完成卷积运算后,向PS发出中断,再循环。简单地说,就是将FF中4层循环的后两层交给PL去加速。
PL
整体框图
黄色:DMA master --> PS slave(AXI HP)
紫色:PS master --> DMA slave(AXI Lite)
青色:PS master --> Register slave(AXI Lite)
红色:DMA master --> BRAM Control(AXI Lite)
其余的一些模块是自动生成的,如时钟、复位逻辑、总线转换逻辑,
BRAM
BRAM设置为真双口,其中A端供PS读写、B端供PL读写。因为容量设置为8K(byte),所以前4K可以缓存特征图,对于最大的源图像 28*28*4 = 3136 byte 刚好够;
后4K缓存卷积完成后的图像,所以相对偏移地址也是4K。
需要留意输出latency,因为RAM没有握手信号,所以锁存数据时是需要掌握它的输出时序
另外一点,手册上也说明了,在BRAM CONTROL控制下的BRAM,这个选项是写死的,但是,我仿真的时候发现很不合理的时序,这个后续再说
Regs
Regs是自己生成的带有AXI总线的寄存器模块,与之前不同,我这里分为PS可读可写(RW)和PS只读(RO)两类:RW属性寄存器可以用来传递5*5个权重、1个偏置、特征图尺寸和PL运算使能信号;RO是PL可写、PS只读,用途是可以将一些关键信号寄存,直接通过串口打印来观察PL运行状况,而不用抓信号,可以调试用。
Ports
除了DDR和Fixed IO连接到管脚,其他的中断、Bport、寄存器是要连接到PL的卷积模块。整体结构如下图
卷积模块
conv1模块源码
核心逻辑是25个浮点乘法器和13个浮点加法器:1次并行完成25个px*weight,再分5次完成sum(px_mult[i])+bias;状态机根据握手信号跳转,目前没有流水化处理,运算模块针对1和0的特殊浮点稍微改动
// conv1: evaluates one 5x5 convolution window.
//
// Computes  data_out = sum(weight_in[k] * data_in[k], k = 0..24) + bias_in
// using 25 float_multi_module instances (all started together) followed by
// five sequential rounds on up to 13 float_add_module instances.
//
// Handshake (as driven by the FSM below):
//   * The FSM only advances while isEN is high.
//   * oDone is high for exactly one clock (state 7 -> state 8) after data_out
//     has been written; the FSM then returns to state 0.
//
// Ports
//   sys_clk, sys_rst_n : clock and asynchronous active-low reset
//   isEN               : run enable; must stay high for the whole computation
//   oDone              : one-clock completion pulse
//   bias_in            : bias operand (32-bit word passed to the adder as-is)
//   weight_in0..24     : kernel taps
//   data_in0..24       : pixel taps, paired index-for-index with the weights
//   data_out           : result, updated when the last adder round finishes
//
// NOTE(review): operands are presumably IEEE-754 single precision; that is
// determined by float_multi_module / float_add_module, which are not part of
// this file - confirm there.
// NOTE(review): only Done_Sig of multiplier 0 and of the highest-numbered
// adder used in each round is checked; this assumes all units started together
// finish together - confirm against the arithmetic modules.
module conv1(
    input             sys_clk,
    input             sys_rst_n,
    input             isEN,
    output reg        oDone,
    input      [31:0] bias_in,
    input      [31:0] weight_in0,  weight_in1,  weight_in2,  weight_in3,  weight_in4,
    input      [31:0] weight_in5,  weight_in6,  weight_in7,  weight_in8,  weight_in9,
    input      [31:0] weight_in10, weight_in11, weight_in12, weight_in13, weight_in14,
    input      [31:0] weight_in15, weight_in16, weight_in17, weight_in18, weight_in19,
    input      [31:0] weight_in20, weight_in21, weight_in22, weight_in23, weight_in24,
    input      [31:0] data_in0,  data_in1,  data_in2,  data_in3,  data_in4,
    input      [31:0] data_in5,  data_in6,  data_in7,  data_in8,  data_in9,
    input      [31:0] data_in10, data_in11, data_in12, data_in13, data_in14,
    input      [31:0] data_in15, data_in16, data_in17, data_in18, data_in19,
    input      [31:0] data_in20, data_in21, data_in22, data_in23, data_in24,
    output reg [31:0] data_out
);

    localparam integer N_TAPS   = 25;   // multipliers (one per kernel tap)
    localparam integer N_ADDERS = 13;   // adders available per round

    // FSM encoding (numeric values identical to the original design).
    localparam [3:0] ST_ARM   = 4'd0,   // one idle cycle after enable
                     ST_MULT  = 4'd1,   // 25 products in parallel
                     ST_ADD1  = 4'd2,   // 13 adders: 12 product pairs + (p24 + bias)
                     ST_ADD2  = 4'd3,   // 6 adders
                     ST_ADD3  = 4'd4,   // 3 adders
                     ST_ADD4  = 4'd5,   // 2 adders (folds in the bias partial sum)
                     ST_ADD5  = 4'd6,   // 1 adder -> data_out
                     ST_FLAG  = 4'd7,   // raise oDone
                     ST_CLEAR = 4'd8;   // drop oDone, back to ST_ARM

    // Flattened operand buses: tap k occupies bits [32*k +: 32].
    wire [32*N_TAPS-1:0] weight_bus = {
        weight_in24, weight_in23, weight_in22, weight_in21, weight_in20,
        weight_in19, weight_in18, weight_in17, weight_in16, weight_in15,
        weight_in14, weight_in13, weight_in12, weight_in11, weight_in10,
        weight_in9,  weight_in8,  weight_in7,  weight_in6,  weight_in5,
        weight_in4,  weight_in3,  weight_in2,  weight_in1,  weight_in0 };

    wire [32*N_TAPS-1:0] pixel_bus = {
        data_in24, data_in23, data_in22, data_in21, data_in20,
        data_in19, data_in18, data_in17, data_in16, data_in15,
        data_in14, data_in13, data_in12, data_in11, data_in10,
        data_in9,  data_in8,  data_in7,  data_in6,  data_in5,
        data_in4,  data_in3,  data_in2,  data_in1,  data_in0 };

    reg  [3:0]               i;          // FSM state
    reg                      mult_en;    // common start for all multipliers
    reg  [32*N_ADDERS-1:0]   add_bus;    // adder k operand B
    reg  [32*N_ADDERS-1:0]   added_bus;  // adder k operand A
    reg  [N_ADDERS-1:0]      add_en;     // per-adder start
    wire [32*N_TAPS-1:0]     px_mult;    // product k
    wire [N_TAPS-1:0]        mult_done;
    wire [32*N_ADDERS-1:0]   px_sum;     // sum k
    wire [N_ADDERS-1:0]      add_done;

    integer k;

    always @(posedge sys_clk or negedge sys_rst_n) begin
        if (!sys_rst_n) begin
            i         <= ST_ARM;
            mult_en   <= 1'b0;
            add_bus   <= {32*N_ADDERS{1'b0}};
            added_bus <= {32*N_ADDERS{1'b0}};
            add_en    <= {N_ADDERS{1'b0}};
            oDone     <= 1'b0;
            data_out  <= 32'd0;          // fix: was left uninitialised
        end
        else if (isEN) begin
            case (i)
                ST_ARM: begin
                    i <= ST_MULT;
                end

                // Start every multiplier and wait for unit 0 to finish.
                ST_MULT: begin
                    if (mult_done[0]) begin
                        mult_en <= 1'b0;
                        // Products (2k, 2k+1) feed adder k for k = 0..11.
                        for (k = 0; k < 12; k = k + 1) begin
                            add_bus  [32*k +: 32] <= px_mult[64*k      +: 32];
                            added_bus[32*k +: 32] <= px_mult[64*k + 32 +: 32];
                        end
                        // Adder 12 handles the odd product plus the bias.
                        add_bus  [32*12 +: 32] <= px_mult[32*24 +: 32];
                        added_bus[32*12 +: 32] <= bias_in;
                        i <= ST_ADD1;
                    end
                    else begin
                        mult_en <= 1'b1;
                    end
                end

                // Round 1: all 13 adders.
                ST_ADD1: begin
                    if (add_done[12]) begin
                        add_en <= {N_ADDERS{1'b0}};
                        for (k = 0; k < 6; k = k + 1) begin
                            add_bus  [32*k +: 32] <= px_sum[64*k      +: 32];
                            added_bus[32*k +: 32] <= px_sum[64*k + 32 +: 32];
                        end
                        i <= ST_ADD2;
                    end
                    else begin
                        add_en <= {N_ADDERS{1'b1}};
                    end
                end

                // Round 2: adders 0..5.
                ST_ADD2: begin
                    if (add_done[5]) begin
                        add_en[5:0] <= 6'b000000;
                        for (k = 0; k < 3; k = k + 1) begin
                            add_bus  [32*k +: 32] <= px_sum[64*k      +: 32];
                            added_bus[32*k +: 32] <= px_sum[64*k + 32 +: 32];
                        end
                        i <= ST_ADD3;
                    end
                    else begin
                        add_en[5:0] <= 6'b111111;
                    end
                end

                // Round 3: adders 0..2. The result of adder 12 (product 24 +
                // bias), produced in round 1, is merged back in here.
                ST_ADD3: begin
                    if (add_done[2]) begin
                        add_en[2:0] <= 3'b000;
                        add_bus  [32*0 +: 32] <= px_sum[32*0  +: 32];
                        added_bus[32*0 +: 32] <= px_sum[32*1  +: 32];
                        add_bus  [32*1 +: 32] <= px_sum[32*2  +: 32];
                        added_bus[32*1 +: 32] <= px_sum[32*12 +: 32];
                        i <= ST_ADD4;
                    end
                    else begin
                        add_en[2:0] <= 3'b111;
                    end
                end

                // Round 4: adders 0..1.
                ST_ADD4: begin
                    if (add_done[1]) begin
                        add_en[1:0] <= 2'b00;
                        add_bus  [32*0 +: 32] <= px_sum[32*0 +: 32];
                        added_bus[32*0 +: 32] <= px_sum[32*1 +: 32];
                        i <= ST_ADD5;
                    end
                    else begin
                        add_en[1:0] <= 2'b11;
                    end
                end

                // Round 5: adder 0 produces the final value.
                ST_ADD5: begin
                    if (add_done[0]) begin
                        data_out  <= px_sum[32*0 +: 32];
                        add_en[0] <= 1'b0;
                        i <= ST_FLAG;
                    end
                    else begin
                        add_en[0] <= 1'b1;
                    end
                end

                ST_FLAG: begin
                    oDone <= 1'b1;
                    i     <= ST_CLEAR;
                end

                ST_CLEAR: begin
                    oDone <= 1'b0;
                    i     <= ST_ARM;
                end

                // fix: recover from an illegal state instead of hanging.
                default: begin
                    mult_en <= 1'b0;
                    add_en  <= {N_ADDERS{1'b0}};
                    oDone   <= 1'b0;
                    i       <= ST_ARM;
                end
            endcase
        end
    end

    // 25 multipliers: product k = weight k * pixel k.
    // Done_Sig comment in the original: { isOver, isUnder, isZero, isDone }.
    genvar g;
    generate
        for (g = 0; g < N_TAPS; g = g + 1) begin : g_mult
            float_multi_module float_multi_module_inst
            (
                .CLK       (sys_clk),
                .RSTn      (sys_rst_n),
                .A         (weight_bus[32*g +: 32]),
                .B         (pixel_bus [32*g +: 32]),
                .Result    (px_mult   [32*g +: 32]),
                .Start_Sig (mult_en),
                .Done_Sig  (mult_done[g]),
                .SQ_rA     (),
                .SQ_rB     (),
                .SQ_Temp   (),
                .SQ_rExp   (),
                .SQ_BDiff  ()
            );
        end

        // 13 adders: sum k = added k + add k.
        for (g = 0; g < N_ADDERS; g = g + 1) begin : g_add
            float_add_module float_add_module_inst
            (
                .CLK         (sys_clk),
                .RSTn        (sys_rst_n),
                .A           (added_bus[32*g +: 32]),
                .B           (add_bus  [32*g +: 32]),
                .Result      (px_sum   [32*g +: 32]),
                .Start_Sig   (add_en[g]),
                .Done_Sig    (add_done[g]),
                .SQ_rA       (),
                .SQ_rB       (),
                .SQ_Temp     (),
                .SQ_TempA    (),
                .SQ_TempB    (),
                .SQ_rExp     (),
                .SQ_rExpDiff ()
            );
        end
    endgenerate

endmodule
layer1模块源码
核心操作是通过B port对BRAM进行读操作和写操作,中间需要缓存去锁存待运算数据、等到conv1模块运算完成后再去写入偏移后的地址,然后控制地址跳转至下一个卷积子图像起始地址。
// layer1: sweeps a 5x5 kernel over a square feature map held in block RAM
// (stride 1, no padding) and writes each result back to the same RAM.
//
// Per output pixel the FSM:
//   1. computes the 25 word addresses  (Sx + c) + fmap_size * (Sy + r),
//   2. reads the 25 words through the RAM port (ram_addr / rd_in / enb),
//   3. runs conv1 on them,
//   4. writes conv1's result to wr_addr (starting at DES_RAM_BASE) and
//      advances Sx / Sy.
// After the last window ((Sx, Sy) == (fmap_size-5, fmap_size-5)) PL_interrupt
// is pulsed for one clock and the FSM returns to state 0.
//
// Ports
//   reg0..reg24 : 5x5 kernel taps (latched in state 0)
//   reg25       : bias
//   reg26[7:0]  : feature-map edge length in pixels
//   reg27[0]    : run enable (level sensitive, sampled in state 0)
//   reg28..31   : read-only debug words: FSM state, Sx, Sy, wr_addr
//   ram_addr, wr_out, rd_in, enb, web : RAM port B signals
//
// NOTE(review): addresses are word indices; the 32'h400 default of
// DES_RAM_BASE presumably corresponds to the 4 KB byte offset mentioned in the
// original comment (32'hC0001000) - confirm against the BRAM configuration.
// NOTE(review): a feature-map size below 5 makes fmap_size-5 wrap, so the
// sweep would not terminate; the caller must supply fmap_size >= 5.
// NOTE(review): the enable is level sensitive - if reg27[0] is still 1 when
// the FSM returns to state 0, a new sweep starts immediately.
module layer1(
    input             sys_clk,
    input             sys_rst_n,
    output            PL_interrupt,          // one-clock pulse when the sweep is done
    input      [31:0] reg0,  reg1,  reg2,  reg3,  reg4,
    input      [31:0] reg5,  reg6,  reg7,  reg8,  reg9,
    input      [31:0] reg10, reg11, reg12, reg13, reg14,
    input      [31:0] reg15, reg16, reg17, reg18, reg19,
    input      [31:0] reg20, reg21, reg22, reg23, reg24,   // 5*5 filter
    input      [31:0] reg25,                 // bias
    input      [31:0] reg26,                 // bit[7:0] size
    input      [31:0] reg27,                 // bit[0] isEN
    output reg [31:0] reg28,                 // for debug
    output reg [31:0] reg29,
    output reg [31:0] reg30,
    output reg [31:0] reg31,
    output reg [31:0] ram_addr,              // addrb
    output reg [31:0] wr_out,                // dinb
    input      [31:0] rd_in,                 // doutb
    output reg        enb,                   // enb
    output reg [3:0]  web                    // web
);
    // inner port
    parameter DES_RAM_BASE = 32'h400;        // ram desti addr 32'hC0001000

    // FSM encoding (numeric values identical to the original design).
    localparam [5:0] ST_LOAD     = 6'd0,     // latch kernel / bias / size, wait for enable
                     ST_ADDR_A   = 6'd1,     // row/column partial terms
                     ST_ADDR_B   = 6'd2,     // row base addresses
                     ST_ADDR_C   = 6'd3,     // 25 tap addresses
                     ST_RD_FIRST = 6'd4,     // states 4..30: pipelined RAM read
                     ST_RD_LAST  = 6'd30,
                     ST_CONV     = 6'd31,    // run conv1, then issue the write
                     ST_NEXT     = 6'd32,    // advance the window
                     ST_IRQ      = 6'd33,    // raise interrupt
                     ST_IRQ_CLR  = 6'd34;    // drop interrupt

    reg               conv_en;
    wire              conv_done;
    reg  [5:0]        i;                     // FSM state
    reg               rDone;

    // register in (tap k occupies bits [32*k +: 32])
    reg  [32*25-1:0]  weight_q;
    reg  [31:0]       bias_in;
    reg  [7:0]        fmap_size;
    reg               isEN;                  // registered copy of reg27[0]

    // rd ram: address pipeline
    reg  [31:0]       row_base;              // fmap_size * Sy
    reg  [31:0]       row_step [0:4];        // fmap_size * r
    reg  [31:0]       col_idx  [0:4];        // Sx + c
    reg  [31:0]       row_addr [0:4];        // row_base + row_step[r]
    reg  [31:0]       tap_addr [0:24];       // col_idx[c] + row_addr[r], index 5*r + c

    // px data (tap k occupies bits [32*k +: 32])
    reg  [32*25-1:0]  px_q;

    reg  [31:0]       Sx;                    // window column
    reg  [31:0]       Sy;                    // window row
    reg  [31:0]       wr_addr;               // next result address
    wire [31:0]       data_out;

    integer r, c;

    // core operation
    always @(posedge sys_clk or negedge sys_rst_n) begin
        if (!sys_rst_n) begin
            i         <= ST_LOAD;
            rDone     <= 1'b0;
            conv_en   <= 1'b0;               // fix: was not reset
            weight_q  <= {32*25{1'b0}};
            bias_in   <= 32'd0;
            fmap_size <= 8'd0;
            isEN      <= 1'b0;
            row_base  <= 32'd0;
            for (r = 0; r < 5; r = r + 1) begin
                row_step[r] <= 32'd0;
                col_idx[r]  <= 32'd0;
                row_addr[r] <= 32'd0;
            end
            for (r = 0; r < 25; r = r + 1)
                tap_addr[r] <= 32'd0;
            px_q      <= {32*25{1'b0}};
            // ram control
            ram_addr  <= 32'd0;
            wr_out    <= 32'd0;              // fix: was not reset
            enb       <= 1'b0;
            web       <= 4'd0;
            Sx        <= 32'd0;
            Sy        <= 32'd0;
            wr_addr   <= DES_RAM_BASE;
        end
        else if (i >= ST_RD_FIRST && i <= ST_RD_LAST) begin
            // read 25 px from ram
            // Address k is driven in state 4+k; the matching word is captured
            // in state 6+k, i.e. two states later.
            // NOTE(review): this assumes the RAM presents data one clock after
            // it samples the address (no extra output register) - confirm
            // against the BRAM IP settings.
            if (i == ST_RD_FIRST)
                enb <= 1'b1;

            if (i <= 6'd28)
                ram_addr <= tap_addr[i - 4];
            else if (i == 6'd29) begin
                enb      <= 1'b0;
                ram_addr <= 32'd0;
            end

            if (i >= 6'd6)
                px_q[32*(i - 6) +: 32] <= rd_in;

            i <= i + 1'b1;
        end
        else begin
            case (i)
                // update weights bias
                ST_LOAD: begin
                    weight_q  <= { reg24, reg23, reg22, reg21, reg20,
                                   reg19, reg18, reg17, reg16, reg15,
                                   reg14, reg13, reg12, reg11, reg10,
                                   reg9,  reg8,  reg7,  reg6,  reg5,
                                   reg4,  reg3,  reg2,  reg1,  reg0 };
                    bias_in   <= reg25;
                    fmap_size <= reg26[7:0];
                    isEN      <= reg27[0];
                    // fix: test the live enable bit. The registered isEN is
                    // only refreshed in this state, so after a completed sweep
                    // it still held 1 and started an unwanted second sweep
                    // even though the PS had already cleared reg27[0].
                    if (reg27[0])
                        i <= ST_ADDR_A;
                end

                // update ram addr
                ST_ADDR_A: begin
                    row_base <= fmap_size * Sy;
                    for (r = 0; r < 5; r = r + 1) begin
                        row_step[r] <= fmap_size * r;
                        col_idx[r]  <= Sx + r;
                    end
                    i <= ST_ADDR_B;
                end

                ST_ADDR_B: begin
                    for (r = 0; r < 5; r = r + 1)
                        row_addr[r] <= row_base + row_step[r];
                    i <= ST_ADDR_C;
                end

                ST_ADDR_C: begin
                    for (r = 0; r < 5; r = r + 1)
                        for (c = 0; c < 5; c = c + 1)
                            tap_addr[5*r + c] <= col_idx[c] + row_addr[r];
                    i <= ST_RD_FIRST;
                end

                // calculate en
                ST_CONV: begin
                    if (conv_done) begin
                        // Result ready: stop conv1 and issue one RAM write.
                        conv_en  <= 1'b0;
                        web      <= 4'b1111;
                        enb      <= 1'b1;
                        ram_addr <= wr_addr;
                        wr_out   <= data_out;
                        wr_addr  <= wr_addr + 1'b1;
                        i        <= ST_NEXT;
                    end
                    else begin
                        conv_en <= 1'b1;
                        web     <= 4'd0;
                        enb     <= 1'b0;
                    end
                end

                // data control
                ST_NEXT: begin
                    web <= 4'd0;
                    enb <= 1'b0;
                    if ((Sy == fmap_size - 5) && (Sx == fmap_size - 5)) begin
                        // Last window done: rewind for the next sweep.
                        Sx      <= 32'd0;
                        Sy      <= 32'd0;
                        wr_addr <= DES_RAM_BASE;
                        i       <= ST_IRQ;
                    end
                    else if (Sx == fmap_size - 5) begin
                        // End of a row: move down one row.
                        Sy <= Sy + 1'b1;
                        Sx <= 32'd0;
                        i  <= ST_ADDR_A;
                    end
                    else begin
                        Sx <= Sx + 1'b1;
                        i  <= ST_ADDR_A;
                    end
                end

                ST_IRQ: begin                // interrupt
                    rDone <= 1'b1;
                    i     <= ST_IRQ_CLR;
                end

                ST_IRQ_CLR: begin
                    rDone <= 1'b0;
                    i     <= ST_LOAD;
                end

                // fix: recover from an illegal state instead of hanging.
                default: begin
                    conv_en <= 1'b0;
                    web     <= 4'd0;
                    enb     <= 1'b0;
                    rDone   <= 1'b0;
                    i       <= ST_LOAD;
                end
            endcase
        end
    end

    // fix: the debug outputs were declared but never driven. Expose the FSM
    // state and the sweep position so software can read them back.
    always @(posedge sys_clk or negedge sys_rst_n) begin
        if (!sys_rst_n) begin
            reg28 <= 32'd0;
            reg29 <= 32'd0;
            reg30 <= 32'd0;
            reg31 <= 32'd0;
        end
        else begin
            reg28 <= {26'd0, i};
            reg29 <= Sx;
            reg30 <= Sy;
            reg31 <= wr_addr;
        end
    end

    //
    conv1 conv1_inst(
        .sys_clk     (sys_clk                ),
        .sys_rst_n   (sys_rst_n              ),
        .isEN        (conv_en                ),
        .oDone       (conv_done              ),
        .bias_in     (bias_in                ),
        .weight_in0  (weight_q[32*0  +: 32]  ),
        .weight_in1  (weight_q[32*1  +: 32]  ),
        .weight_in2  (weight_q[32*2  +: 32]  ),
        .weight_in3  (weight_q[32*3  +: 32]  ),
        .weight_in4  (weight_q[32*4  +: 32]  ),
        .weight_in5  (weight_q[32*5  +: 32]  ),
        .weight_in6  (weight_q[32*6  +: 32]  ),
        .weight_in7  (weight_q[32*7  +: 32]  ),
        .weight_in8  (weight_q[32*8  +: 32]  ),
        .weight_in9  (weight_q[32*9  +: 32]  ),
        .weight_in10 (weight_q[32*10 +: 32]  ),
        .weight_in11 (weight_q[32*11 +: 32]  ),
        .weight_in12 (weight_q[32*12 +: 32]  ),
        .weight_in13 (weight_q[32*13 +: 32]  ),
        .weight_in14 (weight_q[32*14 +: 32]  ),
        .weight_in15 (weight_q[32*15 +: 32]  ),
        .weight_in16 (weight_q[32*16 +: 32]  ),
        .weight_in17 (weight_q[32*17 +: 32]  ),
        .weight_in18 (weight_q[32*18 +: 32]  ),
        .weight_in19 (weight_q[32*19 +: 32]  ),
        .weight_in20 (weight_q[32*20 +: 32]  ),
        .weight_in21 (weight_q[32*21 +: 32]  ),
        .weight_in22 (weight_q[32*22 +: 32]  ),
        .weight_in23 (weight_q[32*23 +: 32]  ),
        .weight_in24 (weight_q[32*24 +: 32]  ),
        .data_in0    (px_q[32*0  +: 32]      ),
        .data_in1    (px_q[32*1  +: 32]      ),
        .data_in2    (px_q[32*2  +: 32]      ),
        .data_in3    (px_q[32*3  +: 32]      ),
        .data_in4    (px_q[32*4  +: 32]      ),
        .data_in5    (px_q[32*5  +: 32]      ),
        .data_in6    (px_q[32*6  +: 32]      ),
        .data_in7    (px_q[32*7  +: 32]      ),
        .data_in8    (px_q[32*8  +: 32]      ),
        .data_in9    (px_q[32*9  +: 32]      ),
        .data_in10   (px_q[32*10 +: 32]      ),
        .data_in11   (px_q[32*11 +: 32]      ),
        .data_in12   (px_q[32*12 +: 32]      ),
        .data_in13   (px_q[32*13 +: 32]      ),
        .data_in14   (px_q[32*14 +: 32]      ),
        .data_in15   (px_q[32*15 +: 32]      ),
        .data_in16   (px_q[32*16 +: 32]      ),
        .data_in17   (px_q[32*17 +: 32]      ),
        .data_in18   (px_q[32*18 +: 32]      ),
        .data_in19   (px_q[32*19 +: 32]      ),
        .data_in20   (px_q[32*20 +: 32]      ),
        .data_in21   (px_q[32*21 +: 32]      ),
        .data_in22   (px_q[32*22 +: 32]      ),
        .data_in23   (px_q[32*23 +: 32]      ),
        .data_in24   (px_q[32*24 +: 32]      ),
        .data_out    (data_out               )
    );

    //
    assign PL_interrupt = rDone;

endmodule
激励源码
核心逻辑是控制权重和偏置均为2(40 00 00 00),然后输入不同的源图像,
浮点格式不方便分析,所以刚开始将地址和输入数据设置为递增数,输入36,即6*6,卷积核维数为5,no padding;值得说明的是BRAM ip除了这个地方不一样,其他均一致,因为之前仿真出现问题,并且定位到是这个选项的原因,我大概猜到可能跟AXI除法器的坑类似,但是还需要验证才行。
`timescale 1ns / 1ns
// test_tb: simulation testbench for layer1 together with a dual-port block RAM.
//
// Sequence driven by the stimulus FSM:
//   state 0 : one idle cycle
//   state 1 : write 36 words (a 6x6 image) into RAM port A, addresses 0..35
//   state 2 : load the 25 kernel registers and the bias with 32'h4000_0000,
//             set the feature-map size to 6
//   state 3 : hold reg27[0] = 1 until layer1 raises its interrupt, then clear it
//   state 4 : park
// Results are checked by inspecting the port-B writes in the waveform.
module test_tb();

    // Word written to every pixel, weight and bias register.
    // Per the original comments: 32'h4000_0000 is 2, 32'h3F80_0000 is 1.
    localparam [31:0] STIM_WORD = 32'h4000_0000;
    localparam        N_PIXELS  = 36;        // 6 * 6 input image

    reg CLK;
    reg RST_N;

    // fix: clock and reset now have their own initial blocks, so the clock is
    // defined from time 0 instead of being X until reset is released.
    // Rising edges fall at 5, 15, 25 ns ...; reset is released at 20 ns.
    initial begin
        CLK = 1'b0;
        forever #5 CLK = ~CLK;
    end

    initial begin
        RST_N = 1'b0;
        #20 RST_N = 1'b1;
    end

    // RAM port A: driven by this testbench (stands in for the PS side).
    reg         ena;
    reg  [3:0]  wea;
    reg  [31:0] addra;
    reg  [31:0] dina;
    wire [31:0] douta;
    // RAM port B: driven by layer1.
    wire        enb;
    wire [3:0]  web;
    wire [31:0] addrb;
    wire [31:0] dinb;
    wire [31:0] doutb;

    blk_mem_gen_0 your_instance_name (
        .clka  (CLK),            // input wire clka
        .ena   (ena),            // input wire ena
        .wea   (wea),            // input wire [3 : 0] wea
        .addra (addra[10:0]),    // low 11 address bits are used
        .dina  (dina),           // input wire [31 : 0] dina
        .douta (douta),          // output wire [31 : 0] douta
        .clkb  (CLK),            // input wire clkb
        .enb   (enb),            // input wire enb
        .web   (web),            // input wire [3 : 0] web
        .addrb (addrb[10:0]),    // low 11 address bits are used
        .dinb  (dinb),           // input wire [31 : 0] dinb
        .doutb (doutb)           // output wire [31 : 0] doutb
    );

    wire        PL_interrupt;
    reg  [31:0] reg0,  reg1,  reg2,  reg3,  reg4;
    reg  [31:0] reg5,  reg6,  reg7,  reg8,  reg9;
    reg  [31:0] reg10, reg11, reg12, reg13, reg14;
    reg  [31:0] reg15, reg16, reg17, reg18, reg19;
    reg  [31:0] reg20, reg21, reg22, reg23, reg24;
    reg  [31:0] reg25, reg26, reg27;
    wire [31:0] reg28, reg29, reg30, reg31;

    layer1 layer1_inst(
        .sys_clk      (CLK          ),
        .sys_rst_n    (RST_N        ),
        .PL_interrupt (PL_interrupt ),   // oDone
        .reg0         (reg0         ),
        .reg1         (reg1         ),
        .reg2         (reg2         ),
        .reg3         (reg3         ),
        .reg4         (reg4         ),
        .reg5         (reg5         ),
        .reg6         (reg6         ),
        .reg7         (reg7         ),
        .reg8         (reg8         ),
        .reg9         (reg9         ),
        .reg10        (reg10        ),
        .reg11        (reg11        ),
        .reg12        (reg12        ),
        .reg13        (reg13        ),
        .reg14        (reg14        ),
        .reg15        (reg15        ),
        .reg16        (reg16        ),
        .reg17        (reg17        ),
        .reg18        (reg18        ),
        .reg19        (reg19        ),
        .reg20        (reg20        ),
        .reg21        (reg21        ),
        .reg22        (reg22        ),
        .reg23        (reg23        ),
        .reg24        (reg24        ),   // 5*5 filter
        .reg25        (reg25        ),   // bias
        .reg26        (reg26        ),   // bit[7:0] size
        .reg27        (reg27        ),   // bit[0] isEN
        .reg28        (reg28        ),   // for debug
        .reg29        (reg29        ),
        .reg30        (reg30        ),
        .reg31        (reg31        ),
        .ram_addr     (addrb        ),   // addrb
        .wr_out       (dinb         ),   // dinb
        .rd_in        (doutb        ),   // doutb
        .enb          (enb          ),   // enb
        .web          (web          )    // web
    );

    reg [7:0] i;            // stimulus FSM state
    reg [5:0] addra_cnt;    // next pixel address to write
    reg       rDone;        // waveform marker: set when the interrupt was seen

    always @(posedge CLK or negedge RST_N) begin
        if (!RST_N) begin
            i         <= 0;
            wea       <= 0;
            ena       <= 0;
            addra     <= 0;
            addra_cnt <= 0;     // fix: the original reset this register twice
            dina      <= 0;
            rDone     <= 0;
            reg0  <= 0; reg1  <= 0; reg2  <= 0; reg3  <= 0; reg4  <= 0;
            reg5  <= 0; reg6  <= 0; reg7  <= 0; reg8  <= 0; reg9  <= 0;
            reg10 <= 0; reg11 <= 0; reg12 <= 0; reg13 <= 0; reg14 <= 0;
            reg15 <= 0; reg16 <= 0; reg17 <= 0; reg18 <= 0; reg19 <= 0;
            reg20 <= 0; reg21 <= 0; reg22 <= 0; reg23 <= 0; reg24 <= 0;
            reg25 <= 0; reg26 <= 0; reg27 <= 0;
        end
        else begin
            case (i)
                0: begin
                    i <= i + 1;
                    //wea <= 4'b1111;
                    //ena <= 1;
                end

                // Fill the input image through port A.
                1: begin
                    if (addra_cnt == N_PIXELS) begin
                        wea       <= 0;
                        ena       <= 1;
                        addra     <= 0;
                        dina      <= 0;
                        addra_cnt <= 0;
                        i         <= i + 1;
                        //i <= 6;
                    end
                    else begin
                        wea   <= 4'b1111;
                        ena   <= 1;
                        addra <= addra_cnt;
                        dina  <= STIM_WORD;          // 2
                        //dina <= 32'h0;             // 0
                        //dina <= 32'h3F80_0000;     // 1
                        //dina <= addra_cnt;         // same with addra
                        addra_cnt <= addra_cnt + 1;
                    end
                end

                // Program kernel, bias and feature-map size.
                2: begin
                    reg0  <= STIM_WORD; reg1  <= STIM_WORD; reg2  <= STIM_WORD;
                    reg3  <= STIM_WORD; reg4  <= STIM_WORD; reg5  <= STIM_WORD;
                    reg6  <= STIM_WORD; reg7  <= STIM_WORD; reg8  <= STIM_WORD;
                    reg9  <= STIM_WORD; reg10 <= STIM_WORD; reg11 <= STIM_WORD;
                    reg12 <= STIM_WORD; reg13 <= STIM_WORD; reg14 <= STIM_WORD;
                    reg15 <= STIM_WORD; reg16 <= STIM_WORD; reg17 <= STIM_WORD;
                    reg18 <= STIM_WORD; reg19 <= STIM_WORD; reg20 <= STIM_WORD;
                    reg21 <= STIM_WORD; reg22 <= STIM_WORD; reg23 <= STIM_WORD;
                    reg24 <= STIM_WORD;
                    reg25 <= STIM_WORD;
                    reg26 <= 32'd6;
                    i     <= i + 1;
                end

                // Enable layer1 and wait for its interrupt.
                3: begin
                    if (PL_interrupt) begin
                        rDone <= 1;
                        reg27 <= 32'd0;
                        i     <= i + 1;
                    end
                    else begin
                        rDone <= 0;
                        reg27 <= 32'b1;
                    end
                end

                // Finished: stay here.
                4: begin
                    i <= i;
                end

                /*
                // Optional read-back of the 36 input words through port A.
                6:begin
                    if(addra_cnt == 36)begin
                        wea <= 0;
                        ena <= 0;
                        addra <= 0;
                        addra_cnt <= 0;
                        i <= 2;
                    end
                    else begin
                        wea <= 4'b0000;
                        ena <= 1;
                        addra <= addra_cnt;
                        addra_cnt <= addra_cnt+1;
                        i <= i;
                    end
                end
                */

                // fix: park in the final state if an undefined state is reached.
                default: begin
                    i <= 4;
                end
            endcase
        end
    end

endmodule
仿真
4次地址的缓存
与之对应像素的数值,为方便数制转换为unsigned int
乘法完成后的加法操作
写RAM
验证
工具:浮点在线转换网站
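为什么运算完的写入数据都是4000_0000?因为相比偏置4000_0000(2)来说,递增数太小了。以最大的十进制35为例,其位模式0x0000_0023按IEEE 754单精度解释是约 $4.9\times10^{-44}$ 的非规格化数,乘以权重并累加之后与偏置2相比可以忽略,所以结果仍然是2(4000_0000)。
那换一组激励:像素为4000_0000(2),则 $2\times2\times25+2=102$,结果正确;
像素为40 48 F5 C2(3.14),则 $3.14\times2\times25+2=159$,也没问题。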
总结
接下来要解决32bit address interface选项的问题,它的现象简单地说就是每4个地址都被绑定了一样,对4个地址中的任一个的写操作都会覆盖之前对其中其他几个地址的写操作。第二个是流水化处理,并行加流水才能将PL加速的能力最大程度的发挥出来,不过,我在编码过程中考虑到了流水转换,所以避免了组合逻辑,基本上都是寄存操作,改动量不会很大。