**
日程安排
**
第一天:基于PYNQ开发板的SOC软件开发
第二天:基于FPGA的SOC软硬件开发流程(一)
第三天:基于FPGA的SOC软硬件开发流程(二)
第四天:基于高层次综合的硬件设计(一)
第五天:基于高层次综合的硬件设计(二)
第六天:基于HLS的卷积神经网络软硬件映射
第一天:基于PYNQ开发板的SOC软件开发
- 什么是PYNQ
PYNQ is an open-source project from Xilinx® that makes it easy to design embedded systems with Xilinx Zynq® Systems on Chips (SoCs).
Using the Python language and libraries, designers can exploit the benefits of programmable logic and microprocessors in Zynq to build more capable and exciting embedded systems.
2.结构
3.baseoverlay
PS和PL模块接口单元设计
可将设计好的IP核利用Vivado生成bit流,导入到板子中,就可利用此IP核在PL模块中生成所需的模块电路。
具体下载到板子运行方式:
4.pynq板子的配置过程
1)准备
2).烧写镜像文件
3)同一网段的设置,资源管理器设置,jupyter notebook环境的搭建
pynq板子基本功能实现
1.GPIO的基本实现
from pynq.overlays.base import BaseOverlay

# Load the base overlay; this programs the PL with base.bit.
base = BaseOverlay("base.bit")
base.download()

# GPIO channels exposed by the base overlay.
led = base.leds
sw = base.switches  # fixed: the attribute is `switches`, not `switchs`
buttons = base.buttons

# Example values (previously undefined names): `val` is the bit pattern to
# drive, `mask` selects which LED bits are affected by the write.
val = 0b0101
mask = 0b1111
led.write(val, mask)

# Read back the current switch and push-button states.
sw.read()
buttons.read()
说明:将之前的嵌入式linux镜像烧入到pynq中,然后将生成的基本ip核烧入,并配置成FPGA的相关 电路(PL部分)。然后取利用各部分小的ip电路,如led灯模块,开关,按键,通过控制其状态,进一步通过外设与之相连,就可控制外设了。
2.音频模块的实现
说明:在jupyter notebook的环境下实现音频模块的采集录制,以及对其进行时域与频谱分析。
3.控制摄像头
4.人脸识别以及通过HDMI输出到显示屏控制。
5.协处理器
第二天:基于FPGA的SOC软硬件开发流程(一)
1.AXI4总线:lite:利用该总线实现对led灯的控制。
说明:上面的master interface是站在ARM角度,slave interface是站在IP核角度。AR channel表示在该总线下ARM对IP核进行读地址操作(ARM传地址给IP核),Rd channel表示将该地址下的数据从IP核返回给ARM。AW channel表示ARM对IP核进行写地址操作,Wr channel表示ARM对IP核进行写数据操作。Wr resp表示完成写数据操作后,IP核对ARM的响应(可忽略)。(上面所描述的各通道也都是这种握手型接口。)
代码:
`timescale 1ns / 1ps
//
// Company:
// Engineer:
//
// Create Date: 2019/08/08 17:16:14
// Design Name:
// Module Name: a31
// Project Name:
// Target Devices:
// Tool Versions:
// Description:
//
// Dependencies:
//
// Revision:
// Revision 0.01 - File Created
// Additional Comments:
//
//
`timescale 1ns / 1ps
// AXI4-Lite style slave that exposes a 4-bit bidirectional GPIO port.
// Register map, selected by word address ADDR[3:2]:
//   0: reg0 - value driven onto gpio bits whose direction bit is 1 (read/write)
//   1: reg1 - direction / output enable per bit: 1 = drive pin, 0 = high-Z (read/write)
//   2: reg2 - current level of the gpio pins (read only; writes are ignored)
//   3: reads return 0; writes are ignored
// Only WDATA[3:0] is stored, and only when WSTRB[0] is set.
module axi_gpio_my
(
input S_AXI_ACLK,
input S_AXI_ARESETN,
inout [3:0]gpio,
//AR channel
input S_AXI_ARVALID,
output S_AXI_ARREADY,
input [4-1:0]S_AXI_ARADDR,
input [2:0]S_AXI_ARPROT,
//Rd channel
output [32-1:0]S_AXI_RDATA,
output [1:0]S_AXI_RRESP,
output S_AXI_RVALID,
input S_AXI_RREADY,
//AW channel
input S_AXI_AWVALID,
output S_AXI_AWREADY,
input [4-1:0]S_AXI_AWADDR,
input [2:0]S_AXI_AWPROT,
//Wr channel
input [32-1:0]S_AXI_WDATA,
input S_AXI_WVALID,
output S_AXI_WREADY,
input [4-1:0]S_AXI_WSTRB,//4'b1111. 4'b0011
//Wr Resp
output [1:0]S_AXI_BRESP,
output S_AXI_BVALID,
input S_AXI_BREADY
);
// Write response: BRESP is always 0; BVALID is set by a write-data handshake
// and cleared once BREADY is seen.
assign S_AXI_BRESP=2'b0;
reg axi_bvalid;
assign S_AXI_BVALID=axi_bvalid;
always @(posedge S_AXI_ACLK or negedge S_AXI_ARESETN)
if(~S_AXI_ARESETN)
axi_bvalid<=1'b0;
else
if(S_AXI_WVALID&S_AXI_WREADY)
axi_bvalid<=1'b1;
else
if(S_AXI_BREADY)
axi_bvalid<=1'b0;
// Write address: the word index is latched on the AW handshake.
// addr_word_w_comb bypasses the register when the AW handshake happens in the
// current cycle, otherwise it returns the latched index.
reg [1:0]addr_word_w;
wire [1:0]addr_word_w_comb;
always @(posedge S_AXI_ACLK or negedge S_AXI_ARESETN)
if(~S_AXI_ARESETN)
addr_word_w<=0;
else
if(S_AXI_AWVALID&S_AXI_AWREADY)
addr_word_w<=S_AXI_AWADDR[3:2];
assign addr_word_w_comb=(S_AXI_AWVALID&S_AXI_AWREADY)?S_AXI_AWADDR[3:2]:addr_word_w;
// The slave is always ready to accept a write address.
assign S_AXI_AWREADY=1'b1;
reg [3:0]reg0;
reg [3:0]reg1;
wire [3:0]reg2;
// wphase is high between an accepted write address and the matching write
// data; it drives WREADY, so data is only accepted after an address.
reg wphase;
always @(posedge S_AXI_ACLK or negedge S_AXI_ARESETN)
if(~S_AXI_ARESETN)
wphase<=0;
else
if(S_AXI_AWVALID&S_AXI_AWREADY)
wphase<=1;
else
if(S_AXI_WVALID&S_AXI_WREADY)
wphase<=0;
assign S_AXI_WREADY=wphase;
// Register writes: only word 0 (reg0) and word 1 (reg1) are writable.
always @(posedge S_AXI_ACLK or negedge S_AXI_ARESETN)
if(~S_AXI_ARESETN)
begin
reg0<=0;
reg1<=0;
end
else
if(S_AXI_WVALID&S_AXI_WREADY)
case(addr_word_w_comb)
2'd0:begin if(S_AXI_WSTRB[0]) reg0<=S_AXI_WDATA[3:0];end
2'd1:begin if(S_AXI_WSTRB[0]) reg1<=S_AXI_WDATA[3:0];end
endcase
//reg [1:0]addr_word_r;
//wire [1:0]addr_word_r_comb;
//always @(posedge S_AXI_ACLK or negedge S_AXI_ARESETN)
//if(~S_AXI_ARESETN)
// addr_word_r<=0;
//else
// if(S_AXI_ARVALID&S_AXI_ARREADY)
// addr_word_r<=S_AXI_ARADDR[3:2];
//assign addr_word_r_comb=(S_AXI_ARVALID&S_AXI_ARREADY)?S_AXI_ARADDR[3:1]:addr_word_r;
// Read path: always ready for a read address, RRESP is always 0. The read
// data is registered on the AR handshake and RVALID is held until RREADY.
assign S_AXI_ARREADY=1'b1;
assign S_AXI_RRESP=2'b0;
reg [32-1:0]rdata;
assign S_AXI_RDATA=rdata;
reg rvalid;
assign S_AXI_RVALID=rvalid;
always @(posedge S_AXI_ACLK or negedge S_AXI_ARESETN)
if(~S_AXI_ARESETN)
begin rvalid<=1'b0;rdata<=32'b0;end
else
if(S_AXI_ARVALID&S_AXI_ARREADY)
begin
rvalid<=1'b1;
case(S_AXI_ARADDR[3:2])
2'd0:rdata<={28'b0,reg0};
2'd1:rdata<={28'b0,reg1};
2'd2:rdata<={28'b0,reg2};
2'd3:rdata<=0;
endcase
end
else
if(S_AXI_RVALID&S_AXI_RREADY)
rvalid<=1'b0;
// Tri-state drivers: a pin is driven with reg0[n] when reg1[n] is 1,
// otherwise it is released to high-Z.
assign gpio[3]=reg1[3]?reg0[3]:1'bz;
assign gpio[2]=reg1[2]?reg0[2]:1'bz;
assign gpio[1]=reg1[1]?reg0[1]:1'bz;
assign gpio[0]=reg1[0]?reg0[0]:1'bz;
// reg2 mirrors the pin levels (including pins this module drives itself).
assign reg2=gpio;
//assign reg2[2]=gpio[2];
//IOBUF IOBUF_1
//(
// .I(reg0[1]),
// .T(~reg1[1]),
// .O(reg2[1]),
// .IO(gpio[1])
//);
//IOBUF IOBUF_0
//(
// .I(reg0[0]),
// .T(~reg1[0]),
// .O(reg2[0]),
// .IO(gpio[0])
//);
endmodule
代码解析:实现了利用AXI lite总线,ARM对led灯的控制。reg1中存放的是有关GPIO是输出还是输入的方向寄存器控制。如果是输出方向的则置1,输入状态就变为高阻态。然后寄存器0表示的是利用输出的GPIO控制灯亮灭的寄存器,1表示亮,0表示灭。寄存器2记录这四个GPIO的状态。地址0,4,8分别表示寄存器0,1,2。ARM上地址是以四为单位变化。握手时只有valid和ready信号都为高时,才有效。只有在读或写完地址之后,才能进行读写数据。然后将上述设计电路打包封装成IP核。
# Pin constraints for the four gpio_0 ports: each bit is assigned a package pin
# and the LVCMOS33 I/O standard. The trailing comments give the schematic net
# names (led[0]..led[3]).
set_property -dict { PACKAGE_PIN R14 IOSTANDARD LVCMOS33 } [get_ports { gpio_0[0] }]; #IO_L6N_T0_VREF_34 Sch=led[0]
set_property -dict { PACKAGE_PIN P14 IOSTANDARD LVCMOS33 } [get_ports { gpio_0[1] }]; #IO_L6P_T0_34 Sch=led[1]
set_property -dict { PACKAGE_PIN N16 IOSTANDARD LVCMOS33 } [get_ports { gpio_0[2] }]; #IO_L21N_T3_DQS_AD14N_35 Sch=led[2]
set_property -dict { PACKAGE_PIN M14 IOSTANDARD LVCMOS33 } [get_ports { gpio_0[3] }]; #IO_L23P_T3_35 Sch=led[3]
约束条件。
block设计
为了对该IP核进行控制,需要将生成的IP核与Zynq主控连起来,然后外加ILA查看波形。最后进行综合、布线,生成bit和hwh文件并导入板子中,就可以在jupyter notebook中完成控制了。
2.利用AXI4 lite实现串口的ip核设计与控制
利用ARM对串口(设计的IP核)进行读写数据时,由于串口收发数据速度,以及读写串口的速度存在很大差异。因此外加缓冲队列来减小差异,减小读写出错的问题。
代码:
串口收发实现的总模块(ip核设计的):
`timescale 1ns / 1ps
// AXI4-Lite style slave that bridges the bus to a UART through two FIFOs.
//   - A bus write pushes WDATA[7:0] into the TX FIFO, which feeds uart_send.
//   - A bus read pops one byte from the RX FIFO, which is filled by uart_recv.
// The read/write addresses and WSTRB are not decoded: every write goes to the
// TX FIFO and every read comes from the RX FIFO.
module uart_my
(
input S_AXI_ACLK,
input S_AXI_ARESETN,
input RXD,
output TXD,
//AR channel
input S_AXI_ARVALID,
output S_AXI_ARREADY,
input [3-1:0]S_AXI_ARADDR,
input [2:0]S_AXI_ARPROT,
//Rd channel
output [32-1:0]S_AXI_RDATA,
output [1:0]S_AXI_RRESP,
output S_AXI_RVALID,
input S_AXI_RREADY,
//AW channel
input S_AXI_AWVALID,
output S_AXI_AWREADY,
input [3-1:0]S_AXI_AWADDR,
input [2:0]S_AXI_AWPROT,
//Wr channel
input [32-1:0]S_AXI_WDATA,
input S_AXI_WVALID,
output S_AXI_WREADY,
input [4-1:0]S_AXI_WSTRB,
//Wr Resp
output [1:0]S_AXI_BRESP,
output S_AXI_BVALID,
input S_AXI_BREADY
);
// Write response: BRESP is always 0; BVALID is set by a write-data handshake
// and cleared once BREADY is seen.
assign S_AXI_BRESP=2'b0;
reg axi_bvalid;
assign S_AXI_BVALID=axi_bvalid;
always @(posedge S_AXI_ACLK or negedge S_AXI_ARESETN)
if(~S_AXI_ARESETN)
axi_bvalid<=1'b0;
else
if(S_AXI_WVALID&S_AXI_WREADY)
axi_bvalid<=1'b1;
else
if(S_AXI_BREADY)
axi_bvalid<=1'b0;
// Address channels are always ready.
assign S_AXI_ARREADY=1'b1;
assign S_AXI_AWREADY=1'b1;
wire u_tx_fifo_dout_vld;
wire u_tx_fifo_dout_rdy;
wire [7:0]u_tx_fifo_dout;
// w_phase is high between an accepted write address and the matching write
// data handshake.
reg w_phase;
always @(posedge S_AXI_ACLK or negedge S_AXI_ARESETN)
if(~S_AXI_ARESETN)
w_phase<=1'b0;
else
if(S_AXI_AWVALID & S_AXI_AWREADY)
w_phase<=1'b1;
else
if(S_AXI_WVALID & S_AXI_WREADY)
w_phase<=1'b0;
// The W channel is wired to the TX FIFO input handshake, gated by w_phase:
// WREADY drops when the FIFO reports not-ready, stalling the bus write.
wire u_tx_fifo_vld;
wire u_tx_fifo_rdy;
assign u_tx_fifo_vld=S_AXI_WVALID&w_phase;
assign S_AXI_WREADY=u_tx_fifo_rdy&w_phase;
hs_fifo_64_8 u_tx_fifo
(
.clk(S_AXI_ACLK),
.rst_n(S_AXI_ARESETN),
.din_vld(u_tx_fifo_vld),
.din_rdy(u_tx_fifo_rdy),
.din(S_AXI_WDATA[7:0]),
.dout_vld(u_tx_fifo_dout_vld),
.dout_rdy(u_tx_fifo_dout_rdy),
.dout(u_tx_fifo_dout)
);
// Transmitter drains the TX FIFO.
uart_send u_uart_send
(
.clk(S_AXI_ACLK), //100 MHz
.rst_n(S_AXI_ARESETN),
.dat_vld(u_tx_fifo_dout_vld),
.dat_rdy(u_tx_fifo_dout_rdy),
.dat(u_tx_fifo_dout),
.TXD(TXD)
);
wire u_uart_recv_dout_vld;
wire [7:0]u_uart_recv_dout;
// Receiver output is pushed into the RX FIFO below.
uart_recv u_uart_recv
(
.clk(S_AXI_ACLK), //100MHz
.rst_n(S_AXI_ARESETN),
.RXD(RXD),
.dat_vld(u_uart_recv_dout_vld),
.dat(u_uart_recv_dout)
);
// flag is high between an accepted read address and the matching read data
// handshake.
reg flag;
always @(posedge S_AXI_ACLK or negedge S_AXI_ARESETN)
if(~S_AXI_ARESETN)
flag<=1'b0;
else
if(S_AXI_ARVALID & S_AXI_ARREADY)
flag<=1'b1;
else
if(S_AXI_RVALID & S_AXI_RREADY)
flag<=1'b0;
// The R channel is wired to the RX FIFO output handshake, gated by flag:
// RVALID is only asserted while the FIFO reports valid output data, so a read
// waits until a byte is available.
wire u_rx_fifo_dout_vld;//S_AXI_RVALID
wire u_rx_fifo_dout_rdy;//S_AXI_RREADY
assign S_AXI_RVALID = u_rx_fifo_dout_vld&flag;
assign u_rx_fifo_dout_rdy = S_AXI_RREADY&flag;
// din_rdy is left unconnected: the receiver gets no back-pressure from this
// FIFO.
hs_fifo_64_8 u_rx_fifo
(
.clk(S_AXI_ACLK),
.rst_n(S_AXI_ARESETN),
.din_vld(u_uart_recv_dout_vld),
.din_rdy(),
.din(u_uart_recv_dout),
// .din_vld(u_tx_fifo_dout_vld),
// .din_rdy(u_tx_fifo_dout_rdy),
// .din(u_tx_fifo_dout),
.dout_vld(u_rx_fifo_dout_vld),
.dout_rdy(u_rx_fifo_dout_rdy),
.dout(S_AXI_RDATA[7:0])
);
// RRESP is always 0 and the upper 24 read-data bits are zero.
assign S_AXI_RRESP=2'b0;
assign S_AXI_RDATA[31:8]=0;
endmodule
写队列以及完成握手信号与队列信号的转化
队列的生成,数据宽度是八位,可以存储的容量是1024个数。选择first word fall through(FWFT)模式,即读写数据时,时钟上升沿到来时,数据立刻送入队列或从队列取出;而standard FIFO要在此上升沿之后的下一个时钟周期才读出或写入数据。
`timescale 1ns / 1ps
// Wraps the fifo_fw_8x64 core (wr_en/rd_en/full/empty interface) with
// valid/ready handshakes on both the input and the output side.
module hs_fifo_64_8
(
    input        clk,
    input        rst_n,
    // input side handshake
    input        din_vld,
    output       din_rdy,
    input  [7:0] din,
    // output side handshake
    output       dout_vld,
    input        dout_rdy,
    output [7:0] dout
);
    wire core_full;
    wire core_empty;

    // Ready while the core is not full; valid while it is not empty.
    assign dout_vld = ~core_empty;
    assign din_rdy  = ~core_full;

    // A transfer happens only when valid and ready are both high.
    wire push = din_vld & din_rdy;
    wire pop  = dout_vld & dout_rdy;

    fifo_fw_8x64 u_fifo_fw_8x64
    (
        .clk   (clk),
        .rst   (~rst_n),      // core reset is active high
        .din   (din),
        .wr_en (push),
        .rd_en (pop),
        .dout  (dout),
        .full  (core_full),
        .empty (core_empty)
    );
endmodule
代码解析:因为对于队列,BRAM其对应的控制信号是wr_en,rd_en,full,empty类型的FIFO接口,而对于通用来说ip核设计的,以及ARM和ip之间都是握手型信号接口。即din_vld,din_rdy,dout_vld,dout_rdy。对于写队列转化来说:wr_en=din_vld&din_rdy,din_rdy=~fifo_full。
对于读队列来说:rd_en=(dout_vld&dout_rdy), dout_vld=~fifo_empty存在这样的转化关系。
串口发数据(ip核内部)
`timescale 1ns / 1ps
// UART transmitter with a valid/ready input.
// Frame sent on TXD: start bit (0), 8 data bits starting with dat[0],
// one parity bit (XOR of all data bits), stop bit (1), then one extra
// idle bit period at 1.
// Each bit period lasts one full run of cnt (0..868, i.e. 869 clk cycles).
// dat must stay stable while dat_vld is high until dat_rdy pulses.
module uart_send
(
input clk, //100 MHz
input rst_n,
input dat_vld,
output dat_rdy,
input [7:0]dat,
output TXD
);
// Parity bit: XOR reduction of the data byte.
wire check_bit=^dat;
// Bit-period counter: runs only while dat_vld is high, wraps after 868.
reg [9:0]cnt;
wire cnt_ovfl=(cnt==10'd868);
always @(posedge clk or negedge rst_n)
if(~rst_n)
cnt<=0;
else
if(dat_vld)
begin
if(cnt_ovfl)
cnt<=0;
else
cnt<=cnt+1;
end
else
cnt<=0;
reg [2:0]state;
reg [2:0]dat_bit_cnt;
reg txd;
assign TXD=txd;
// Frame state machine; every state except idle advances on cnt_ovfl.
always @(posedge clk or negedge rst_n)
if(~rst_n)
begin state<=0;txd<=1'b1;dat_bit_cnt<=0;end
else
case(state)
0://idle
if(dat_vld)
state<=1;
1://start bit
begin
txd<=1'b0;
if(cnt_ovfl)
state<=2;
end
2://data bit
begin
txd<=dat[dat_bit_cnt];
if(cnt_ovfl)
begin
dat_bit_cnt<=dat_bit_cnt+1;
// after bit 7 the later assignment wins: the counter returns to 0
if(dat_bit_cnt==7)
begin dat_bit_cnt<=0;state<=3;end
end
end
3://check bit
begin
txd<=check_bit;
if(cnt_ovfl)
state<=4;
end
4://stop bit
begin
txd<=1'b1;
if(cnt_ovfl)
state<=5;
end
5://idle bit
begin
txd<=1'b1;
// NOTE(review): cnt only runs while dat_vld is high, so this state appears
// to wait for the next dat_vld before it can finish - confirm intended.
if(cnt_ovfl)
state<=0;
end
endcase
// The byte is consumed (ready pulse) at the end of the stop bit.
assign dat_rdy=(state==4)&cnt_ovfl;
endmodule
`timescale 1ns / 1ps
串口收内部
// UART receiver.
// Expected frame on RXD: start bit (0), 8 data bits stored into
// recv_data[0] first, one parity bit, stop bit (1).
// Each bit period lasts one full run of cnt (0..868); data and parity bits
// are sampled when cnt==434, i.e. in the middle of the bit period.
// dat_vld pulses and dat is updated one cycle after state 5, but only when
// the received parity bit equals the XOR of the received data bits.
// NOTE(review): RXD is used directly, with no synchroniser flip-flops in this
// module - confirm it is synchronised upstream if it is an asynchronous pin.
module uart_recv
(
input clk, //100MHz
input rst_n,
input RXD,
output reg dat_vld,
output reg [7:0]dat
);
reg [2:0]state;
reg [9:0]cnt;
wire cnt_ovfl=(cnt==10'd868);
// Start bit is rejected if RXD returns high before cnt reaches 700.
wire start_bit_recv_error=(state==1)&(RXD==1)&(cnt<700);
// Stop bit is rejected if RXD is low while cnt is between 100 and 700.
wire stop_bit_recv_error=(state==4)&(RXD==0)&(cnt>100)&(cnt<700);
// Bit-period counter: runs whenever the state machine is not idle.
always @(posedge clk or negedge rst_n)
if(~rst_n)
cnt<=0;
else
if(state!=0)
begin
if(cnt_ovfl)
cnt<=0;
else
cnt<=cnt+1;
end
else
cnt<=0;
reg [2:0]dat_bit_cnt;
reg [7:0]recv_data;reg check_bit;
// Frame state machine.
always @(posedge clk or negedge rst_n)
if(~rst_n)
begin
state<=0;
dat_bit_cnt<=0;
recv_data<=0;
check_bit<=0;
end
else
case(state)
// idle: a low level on RXD starts reception
0:
if(RXD==0)
state<=1;
1://start bit
if(cnt_ovfl)
state<=2;
else
if(start_bit_recv_error)
state<=0;
2://data bit
begin
if(cnt==434)
recv_data[dat_bit_cnt]<=RXD;
if(cnt_ovfl)
begin
// 3-bit counter: wraps back to 0 after bit 7
dat_bit_cnt<=dat_bit_cnt+1;
if(dat_bit_cnt==7)
state<=3;
end
end
3://check bit
begin
if(cnt==434)
check_bit<=RXD;
if(cnt_ovfl)
state<=4;
end
4://stop bit
begin
if(stop_bit_recv_error)
state<=0;
else
if(cnt_ovfl)
state<=5;
end
// one-cycle state in which the parity check result is evaluated
5:
state<=0;
endcase
// Byte is accepted only when the received parity matches the data parity.
wire dat_vld_d=(state==5)&&(check_bit==(^recv_data));
always @(posedge clk or negedge rst_n)
if(~rst_n)
dat_vld<=1'b0;
else
dat_vld<=dat_vld_d;
always @(posedge clk or negedge rst_n)
if(~rst_n)
dat<=0;
else
if(dat_vld_d)
dat<=recv_data;
endmodule
代码解析:串口收发数据位设为8位,一位校验位,一位停止位,868计数器是控制波特率,大约在115200;状态的切换,开始时状态0表示空闲状态。设立计数器实现分频的作用。采样时在一个周期的正中间完成采样。因为对于数据的稳定性较好。
读队列
`timescale 1ns / 1ps
// Wraps the fifo_fw_8x64 core (wr_en/rd_en/full/empty interface) with
// valid/ready handshakes on both the input and the output side.
module hs_fifo_64_8
(
    input        clk,
    input        rst_n,
    // input side handshake
    input        din_vld,
    output       din_rdy,
    input  [7:0] din,
    // output side handshake
    output       dout_vld,
    input        dout_rdy,
    output [7:0] dout
);
    wire core_full;
    wire core_empty;

    // Ready while the core is not full; valid while it is not empty.
    assign dout_vld = ~core_empty;
    assign din_rdy  = ~core_full;

    // A transfer happens only when valid and ready are both high.
    wire push = din_vld & din_rdy;
    wire pop  = dout_vld & dout_rdy;

    fifo_fw_8x64 u_fifo_fw_8x64
    (
        .clk   (clk),
        .rst   (~rst_n),      // core reset is active high
        .din   (din),
        .wr_en (push),
        .rd_en (pop),
        .dout  (dout),
        .full  (core_full),
        .empty (core_empty)
    );
endmodule
整体框架实现(arm与串口的连接控制)
测试过程。
第三天:基于FPGA的SOC软硬件开发流程(二)
1.DMA IP核设计
AXI4总线:full
代码:
1.DMA总设计模块
`timescale 1ns / 1ps
// Memory-to-memory DMA top level.
//   - S_AXI_*: AXI4-Lite style control port, handled by dma_reg
//     (start/done, source address, destination address, size).
//   - M_AXI_*: AXI4 master port. Read bursts are issued by send_rd_cmd and
//     write bursts by send_wr_cmd; the read data channel is wired straight
//     through to the write data channel.
// `size` is the number of beats minus one.
module dma #
(
    parameter C_AXI_DATA_WIDTH=32,
    parameter C_M_AXI_ID_WIDTH=4
)
(
    input clk,
    input rst_n,
    //AR channel
    input S_AXI_ARVALID,
    output S_AXI_ARREADY,
    input [4-1:0]S_AXI_ARADDR,
    input [2:0]S_AXI_ARPROT,
    //Rd channel
    output [32-1:0]S_AXI_RDATA,
    output [1:0]S_AXI_RRESP,
    output S_AXI_RVALID,
    input S_AXI_RREADY,
    //AW channel
    input S_AXI_AWVALID,
    output S_AXI_AWREADY,
    input [4-1:0]S_AXI_AWADDR,
    input [2:0]S_AXI_AWPROT,
    //Wr channel
    input [32-1:0]S_AXI_WDATA,
    input S_AXI_WVALID,
    output S_AXI_WREADY,
    input [4-1:0]S_AXI_WSTRB,
    //Wr Resp
    output [1:0]S_AXI_BRESP,
    output S_AXI_BVALID,
    input S_AXI_BREADY,
    //AR channel
    output [C_M_AXI_ID_WIDTH-1 : 0] M_AXI_ARID,
    output [32-1 : 0] M_AXI_ARADDR,
    output [7 : 0] M_AXI_ARLEN,
    output [2 : 0] M_AXI_ARSIZE,//=clogb2((`AXI_DATA_WIDTH/8)-1);
    output [1 : 0] M_AXI_ARBURST,//=2'b01;
    output M_AXI_ARLOCK,//=1'b0;
    output [3 : 0] M_AXI_ARCACHE,//=4'b0010;
    output [2 : 0] M_AXI_ARPROT,//=3'h0;
    output [3 : 0] M_AXI_ARQOS,//=4'h0;
    output M_AXI_ARVALID,
    input M_AXI_ARREADY,
    //Rd channel
    input [C_M_AXI_ID_WIDTH-1 : 0] M_AXI_RID,
    input [C_AXI_DATA_WIDTH-1 : 0] M_AXI_RDATA,
    input [1 : 0] M_AXI_RRESP,//ignore
    input M_AXI_RLAST,
    input M_AXI_RVALID,
    output M_AXI_RREADY,
    //AW channel
    output [C_M_AXI_ID_WIDTH-1 : 0] M_AXI_AWID,
    output [32-1 : 0] M_AXI_AWADDR,
    output [7 : 0] M_AXI_AWLEN,
    output [2 : 0] M_AXI_AWSIZE,//=clogb2((`AXI_DATA_WIDTH/8)-1);
    output [1 : 0] M_AXI_AWBURST,//=2'b01;
    output M_AXI_AWLOCK,//1'b0;
    output [3 : 0] M_AXI_AWCACHE,//=4'b0010
    output [2 : 0] M_AXI_AWPROT,//=3'h0;
    output [3 : 0] M_AXI_AWQOS,//=4'h0;
    output M_AXI_AWVALID,
    input M_AXI_AWREADY,
    //Wr channel
    output [C_AXI_DATA_WIDTH-1 : 0] M_AXI_WDATA,
    output [C_AXI_DATA_WIDTH/8-1 : 0] M_AXI_WSTRB,
    output M_AXI_WLAST,
    output M_AXI_WVALID,
    input M_AXI_WREADY,
    //Resp channel
    input [C_M_AXI_ID_WIDTH-1 : 0] M_AXI_BID,//ignore
    input [1 : 0] M_AXI_BRESP,//ignore
    input M_AXI_BVALID,//BVALID and BREADY together mean a write response.
    output M_AXI_BREADY//BVALID and BREADY together mean a write response.
);
    // Number of bits needed to represent bit_depth (ceil(log2(bit_depth+1))).
    function integer clogb2 (input integer bit_depth);
    begin
        for(clogb2=0; bit_depth>0; clogb2=clogb2+1)
            bit_depth = bit_depth >> 1;
    end
    endfunction

    // Constant sideband signals of the master write address channel.
    assign M_AXI_AWSIZE=clogb2((C_AXI_DATA_WIDTH/8)-1);
    assign M_AXI_AWBURST=2'b01;
    assign M_AXI_AWLOCK=1'b0;
    assign M_AXI_AWCACHE=4'b0010;
    assign M_AXI_AWPROT=3'h0;
    assign M_AXI_AWQOS=4'h0;
    // Constant sideband signals of the master read address channel.
    assign M_AXI_ARSIZE=clogb2((C_AXI_DATA_WIDTH/8)-1);
    assign M_AXI_ARBURST=2'b01;
    assign M_AXI_ARLOCK=1'b0;
    assign M_AXI_ARCACHE=4'b0010;
    assign M_AXI_ARPROT=3'h0;
    assign M_AXI_ARQOS=4'h0;
    assign M_AXI_ARID=0;
    assign M_AXI_AWID=0;
    // Write responses are always accepted and otherwise ignored.
    assign M_AXI_BREADY=1'b1;

    wire start,done;
    wire [31:0]src_addr;
    wire [31:0]dst_addr;
    wire [15:0]size;

    // Read data channel is forwarded directly to the write data channel:
    // every beat read from the source is written to the destination, with all
    // byte strobes set.
    assign M_AXI_WDATA=M_AXI_RDATA;
    assign M_AXI_WSTRB={(C_AXI_DATA_WIDTH/8){1'b1}};
    assign M_AXI_WLAST=M_AXI_RLAST;
    assign M_AXI_WVALID=M_AXI_RVALID;
    assign M_AXI_RREADY=M_AXI_WREADY;

    // Control/status registers.
    dma_reg u_dma_reg
    (
        .clk(clk),
        .rst_n(rst_n),
        //AR channel
        .S_AXI_ARVALID(S_AXI_ARVALID),
        .S_AXI_ARREADY(S_AXI_ARREADY),
        .S_AXI_ARADDR(S_AXI_ARADDR),
        .S_AXI_ARPROT(S_AXI_ARPROT),
        //Rd channel
        .S_AXI_RDATA(S_AXI_RDATA),
        .S_AXI_RRESP(S_AXI_RRESP),
        .S_AXI_RVALID(S_AXI_RVALID),
        .S_AXI_RREADY(S_AXI_RREADY),
        //AW channel
        .S_AXI_AWVALID(S_AXI_AWVALID),
        .S_AXI_AWREADY(S_AXI_AWREADY),
        .S_AXI_AWADDR(S_AXI_AWADDR),
        .S_AXI_AWPROT(S_AXI_AWPROT),
        //Wr channel
        .S_AXI_WDATA(S_AXI_WDATA),
        .S_AXI_WVALID(S_AXI_WVALID),
        .S_AXI_WREADY(S_AXI_WREADY),
        .S_AXI_WSTRB(S_AXI_WSTRB),
        //Wr Resp
        .S_AXI_BRESP(S_AXI_BRESP),
        .S_AXI_BVALID(S_AXI_BVALID),
        .S_AXI_BREADY(S_AXI_BREADY),
        .start(start),
        .done(done),
        .src_addr(src_addr),
        .dst_addr(dst_addr),
        .size(size)
    );

    // Read burst command generator.
    send_rd_cmd #
    (
        .C_AXI_DATA_WIDTH(C_AXI_DATA_WIDTH)
    )u_send_rd_cmd
    (
        .clk(clk),
        .rst_n(rst_n),
        .start(start),
        .src_addr(src_addr),
        .size(size),//real_size - 1
        //AR channel
        .M_AXI_ARADDR(M_AXI_ARADDR),
        .M_AXI_ARLEN(M_AXI_ARLEN),
        .M_AXI_ARVALID(M_AXI_ARVALID),
        .M_AXI_ARREADY(M_AXI_ARREADY)
    );

    // Write burst command generator. The data width now follows the module
    // parameter (it was hard-coded to 32), matching send_rd_cmd so both sides
    // step their addresses by the same amount.
    send_wr_cmd #
    (
        .C_AXI_DATA_WIDTH(C_AXI_DATA_WIDTH)
    )u_send_wr_cmd
    (
        .clk(clk),
        .rst_n(rst_n),
        .start(start),
        .dst_addr(dst_addr),
        .size(size),//real_size - 1
        //AW channel
        .M_AXI_AWADDR(M_AXI_AWADDR),
        .M_AXI_AWLEN(M_AXI_AWLEN),
        .M_AXI_AWVALID(M_AXI_AWVALID),
        .M_AXI_AWREADY(M_AXI_AWREADY)
    );

    // Counts transferred write beats since the last start; done pulses on the
    // beat whose index equals size (the last beat).
    reg [15:0]cnt;
    always @(posedge clk or negedge rst_n)
        if(~rst_n)
            cnt<=0;
        else
            if(start)
                cnt<=0;
            else
                if(M_AXI_WVALID&M_AXI_WREADY)
                    cnt<=cnt+1;
    assign done=(M_AXI_WVALID&M_AXI_WREADY)&(cnt==size);
endmodule
2.相关寄存器配置
`timescale 1ns / 1ps
// Control/status register file of the DMA, accessed through an AXI4-Lite
// style slave port. Register map, selected by word address ADDR[3:2]:
//   0: write with WDATA[0]=1 pulses `start`; read returns done_r in bit 1
//   1: src_addr (read/write)
//   2: dst_addr (read/write)
//   3: size, 16 bits (read/write)
// WSTRB is not decoded: writes always update the whole register.
module dma_reg
(
    input clk,
    input rst_n,
    //AR channel
    input S_AXI_ARVALID,
    output S_AXI_ARREADY,
    input [4-1:0]S_AXI_ARADDR,
    input [2:0]S_AXI_ARPROT,
    //Rd channel
    output [32-1:0]S_AXI_RDATA,
    output [1:0]S_AXI_RRESP,
    output S_AXI_RVALID,
    input S_AXI_RREADY,
    //AW channel
    input S_AXI_AWVALID,
    output S_AXI_AWREADY,
    input [4-1:0]S_AXI_AWADDR,
    input [2:0]S_AXI_AWPROT,
    //Wr channel
    input [32-1:0]S_AXI_WDATA,
    input S_AXI_WVALID,
    output S_AXI_WREADY,
    input [4-1:0]S_AXI_WSTRB,
    //Wr Resp
    output [1:0]S_AXI_BRESP,
    output S_AXI_BVALID,
    input S_AXI_BREADY,
    output start,//reg0
    input done,//reg0
    output reg [31:0]src_addr,//reg1
    output reg [31:0]dst_addr,//reg2
    output reg [15:0]size//reg3
);
    reg done_r;

    // Write response: BRESP is always 0; BVALID is set by a write-data
    // handshake and cleared once BREADY is seen.
    assign S_AXI_BRESP=2'b0;
    reg axi_bvalid;
    assign S_AXI_BVALID=axi_bvalid;
    always @(posedge clk or negedge rst_n)
        if(~rst_n)
            axi_bvalid<=1'b0;
        else
            if(S_AXI_WVALID&S_AXI_WREADY)
                axi_bvalid<=1'b1;
            else
                if(S_AXI_BREADY)
                    axi_bvalid<=1'b0;

    // Write address: word index latched on the AW handshake, with a
    // combinational bypass for a handshake in the current cycle.
    reg [1:0]addr_word_w;
    wire [1:0]addr_word_w_comb;
    always @(posedge clk or negedge rst_n)
        if(~rst_n)
            addr_word_w<=0;
        else
            if(S_AXI_AWVALID&S_AXI_AWREADY)
                addr_word_w<=S_AXI_AWADDR[3:2];
    assign addr_word_w_comb=(S_AXI_AWVALID&S_AXI_AWREADY)?S_AXI_AWADDR[3:2]:addr_word_w;
    assign S_AXI_AWREADY=1'b1;//S_AXI_AWVALID&S_AXI_WVALID;

    // w_phase is high between an accepted write address and the matching
    // write data; it drives WREADY.
    reg w_phase;
    always @(posedge clk or negedge rst_n)
        if(~rst_n)
            w_phase<=1'b0; // fixed: was a blocking '=' mixed with '<=' on the same reg
        else
            if(S_AXI_AWVALID&S_AXI_AWREADY)
                w_phase<=1;
            else
                if(S_AXI_WVALID&S_AXI_WREADY)
                    w_phase<=0;
    assign S_AXI_WREADY=w_phase;

    // Register writes (word 0 has no storage; it only generates `start`).
    always @(posedge clk or negedge rst_n)
        if(~rst_n)
        begin
            src_addr<=0;
            dst_addr<=0;
            size<=0;
        end
        else
            if(S_AXI_WVALID&S_AXI_WREADY)
                case(addr_word_w_comb)
                    2'd1:begin src_addr<=S_AXI_WDATA;end
                    2'd2:begin dst_addr<=S_AXI_WDATA;end
                    2'd3:begin size<=S_AXI_WDATA[15:0];end
                endcase

    // Read path: always ready for a read address, RRESP is always 0. The read
    // data is registered on the AR handshake and RVALID is held until RREADY.
    assign S_AXI_ARREADY=1'b1;
    assign S_AXI_RRESP=2'b0;
    reg [32-1:0]rdata;
    assign S_AXI_RDATA=rdata;
    reg rvalid;
    assign S_AXI_RVALID=rvalid;
    always @(posedge clk or negedge rst_n)
        if(~rst_n)
            begin rvalid<=1'b0;rdata<=32'b0;end
        else
            if(S_AXI_ARVALID&S_AXI_ARREADY)
            begin
                rvalid<=1'b1;
                case(S_AXI_ARADDR[3:2])
                    2'd0:rdata<={30'b0,done_r,1'b0};
                    2'd1:rdata<=src_addr;
                    2'd2:rdata<=dst_addr;
                    2'd3:rdata<={16'b0,size};
                endcase
            end
            else
                if(S_AXI_RVALID&S_AXI_RREADY)
                    rvalid<=1'b0;

    // start is a combinational pulse during a write of bit 0 = 1 to word 0.
    assign start=S_AXI_WVALID&S_AXI_WREADY&(addr_word_w_comb==0)&S_AXI_WDATA[0];

    // done_r is a sticky status flag: set by `done`, cleared by `start`.
    always @(posedge clk or negedge rst_n)
        if(~rst_n)
            done_r<=0;
        else
            if(start)
                done_r<=0;
            else
                if(done)
                    done_r<=1;
endmodule
3.DMA读内存中数据控制
`timescale 1ns / 1ps
// Read burst command generator for the DMA (AXI4 AR channel).
// This listing was a second copy of send_wr_cmd, although the dma module
// instantiates send_rd_cmd with src_addr and M_AXI_AR* ports; it is now the
// read-side counterpart with the same burst logic.
//
// After `start`, read address commands are issued from src_addr onwards.
// Every burst except the last has 256 beats (ARLEN = 8'hff); the last burst
// uses ARLEN = size[7:0]. `size` is the total number of beats minus one.
module send_rd_cmd #
(
    parameter C_AXI_DATA_WIDTH=32
)
(
    input clk,
    input rst_n,
    input start,
    input [31:0]src_addr,
    input [15:0]size,//real_size - 1
    //AR channel
    output [32-1 : 0] M_AXI_ARADDR,
    output [7 : 0] M_AXI_ARLEN,
    output M_AXI_ARVALID,
    input M_AXI_ARREADY
);
    reg state;
    reg [15:0]ptr;       // beat index of the first beat of the current burst
    reg arvalid;
    reg [31:0]raddr;

    // The current burst is the last one when its 256-beat block index equals
    // the block index of the final beat.
    wire last_burst=(ptr[15:8]==size[15:8]);

    assign M_AXI_ARLEN=last_burst?size[7:0]:8'hff;
    assign M_AXI_ARVALID=arvalid;
    assign M_AXI_ARADDR=raddr;

    always @(posedge clk or negedge rst_n)
        if(~rst_n)
        begin
            ptr<=0;
            state<=0;
            arvalid<=0;
            raddr<=0;
        end
        else
            case(state)
                0://idle
                    if(start)
                    begin
                        state<=1;
                        arvalid<=1;
                        raddr<=src_addr;
                    end
                1:
                    if(M_AXI_ARVALID&M_AXI_ARREADY)//send a rd cmd
                    begin
                        if(last_burst)
                        begin
                            arvalid<=0;
                            raddr<=0;
                            ptr<=0;
                            state<=0;
                        end
                        else
                        begin
                            // advance by one full 256-beat burst
                            raddr<=raddr+(C_AXI_DATA_WIDTH/8)*256;//1K, 2k
                            ptr<=ptr+256;
                        end
                    end
            endcase
endmodule
4.DMA向内存写数据控制
`timescale 1ns / 1ps
// Write burst command generator for the DMA (AXI4 AW channel).
// After `start`, write address commands are issued from dst_addr onwards.
// Every burst except the last has 256 beats (AWLEN = 8'hff); the last burst
// uses AWLEN = size[7:0]. `size` is the total number of beats minus one.
module send_wr_cmd #
(
    parameter C_AXI_DATA_WIDTH=32
)
(
    input clk,
    input rst_n,
    input start,
    input [31:0]dst_addr,
    input [15:0]size,//real_size - 1
    //AW channel
    output [32-1 : 0] M_AXI_AWADDR,
    output [7 : 0] M_AXI_AWLEN,
    output M_AXI_AWVALID,
    input M_AXI_AWREADY
);
    localparam ST_IDLE  = 1'b0;
    localparam ST_ISSUE = 1'b1;
    // Address step of one full burst: 256 beats of C_AXI_DATA_WIDTH/8 bytes.
    localparam BURST_BYTES = (C_AXI_DATA_WIDTH/8)*256;

    reg        fsm_q;
    reg [15:0] beat_base_q;   // beat index of the first beat of the current burst
    reg        aw_valid_q;
    reg [31:0] aw_addr_q;

    // Last burst: its 256-beat block index equals that of the final beat.
    wire final_burst = (beat_base_q[15:8] == size[15:8]);
    // An address command is accepted when valid and ready are both high.
    wire aw_fire = aw_valid_q & M_AXI_AWREADY;

    assign M_AXI_AWADDR  = aw_addr_q;
    assign M_AXI_AWVALID = aw_valid_q;
    assign M_AXI_AWLEN   = final_burst ? size[7:0] : 8'hff;

    always @(posedge clk or negedge rst_n)
        if (~rst_n)
        begin
            fsm_q       <= ST_IDLE;
            aw_valid_q  <= 0;
            aw_addr_q   <= 0;
            beat_base_q <= 0;
        end
        else
            case (fsm_q)
                ST_IDLE:
                    if (start)
                    begin
                        aw_addr_q  <= dst_addr;
                        aw_valid_q <= 1;
                        fsm_q      <= ST_ISSUE;
                    end
                ST_ISSUE:
                    if (aw_fire)
                    begin
                        if (final_burst)
                        begin
                            // all commands sent: return to idle and clear state
                            fsm_q       <= ST_IDLE;
                            aw_valid_q  <= 0;
                            aw_addr_q   <= 0;
                            beat_base_q <= 0;
                        end
                        else
                        begin
                            // move on to the next full burst
                            beat_base_q <= beat_base_q + 256;
                            aw_addr_q   <= aw_addr_q + BURST_BYTES;
                        end
                    end
            endcase
endmodule
代码解析:总体完成的内容是通过ARM给内存开辟两块内存空间,分别作为起始和结束地址,然后将地址,发送给设计的DMA(ip核),以及通过ARM将读取和写入的长度告诉ip。分别用寄存器0控制起始结束,寄存器1控制原地址,寄存器2控制目标地址。寄存器三控制读写长度。
说明;在jupyter notebook中对设计的DMA ip核进行测试,通过开两块存储区,分别存放两个数组中的数据,然后利用DMA实现将一块存储区的数据内容搬移到另一块存储区中。并比较搬过去之后对比原来的数据,判断其正确性。
第四天:基于高层次综合的硬件设计(一)
1.芯片的设计流程
2.基于标准单元的芯片综合流程
3.逻辑综合:RTL->门级网表
4.HLS原理
5.HLS原理(pipeline)
6.HLS优化策略
基于HLS,vivado上ip核设计
7.矩阵乘法运算的ip核设计(hls)
代码:
#include<stdio.h>
#include "ap_fixed.h"
// 4x4 matrix multiply: c = a * b.
// a and b hold 8-bit signed elements, c holds 16-bit signed results. All
// three arrays and the block-level control are exposed over AXI4-Lite.
// Each dot product is accumulated in a local 16-bit variable and written to c
// once, so the output array is never read back inside the pipelined loop. The
// stored result is the same as accumulating directly in c[i][j].
void matrix(ap_int<8> a[4][4],ap_int<8> b[4][4],ap_int<16> c[4][4])
{
#pragma HLS INTERFACE s_axilite port=return
#pragma HLS INTERFACE s_axilite port=b
#pragma HLS INTERFACE s_axilite port=c
#pragma HLS INTERFACE s_axilite port=a
//#pragma HLS ARRAY_PARTITION variable=a complete dim=2
// #pragma HLS ARRAY_PARTITION variable=b complete dim=1
#pragma HLS ARRAY_RESHAPE variable=a complete dim=2
#pragma HLS ARRAY_RESHAPE variable=b complete dim=1
    for(int i=0;i<4;i++)
        for(int j=0;j<4;j++)
        {
#pragma HLS PIPELINE
            // Same width as c[i][j], so wrap-around behaviour is unchanged.
            ap_int<16> acc=0;
            for(int k=0;k<4;k++)
//#pragma HLS UNROLL
                acc = acc+a[i][k]*b[k][j];
            c[i][j]=acc;
        }
}
代码解析:实现矩阵乘法。//#pragma HLS UNROLL实现的是在循环内部,可以并行的取数据,计算。保存数据,每一个时钟周期并行的完成。#pragma HLS PIPELINE 也是相当于将最后一个循环看成一个整体,并行的取数据,运算,保存数据。优化的时候两者都可以,也能一起用。//#pragma HLS ARRAY_PARTITION variable=a complete dim=2 实现的是矩阵按第二维中的(四个行数据分为一组,形成一个端口)分组,形成四个端口。#pragma HLS ARRAY_RESHAPE variable=a complete dim=2 reshape实现的是将矩阵a中的四个数据合并为一个32位的数据。然后参与并行运算,也是分为四行,但不改变端口数,只有一个。
testbench:
#include<stdio.h>
#include "ap_fixed.h"
#include "b.h"
#include<iostream>
int i,j,k;
ap_int<8> a[4][4];
ap_int<8> b[4][4];
ap_int<16> c[4][4];
// Testbench: fills a with 4 and b with 8, runs matrix(), then prints every
// element of c on its own line in row-major order.
int main()
{
    // Initialise both operands with constant values.
    for(int row=0;row<4;++row)
    {
        for(int col=0;col<4;++col)
        {
            a[row][col]=4;
            b[row][col]=8;
        }
    }

    matrix(a,b,c);

    // Dump the result matrix.
    for(int row=0;row<4;++row)
    {
        for(int col=0;col<4;++col)
            std::cout<<c[row][col]<<std::endl;
    }
    return 0;
}
生成ip核 并利用ARM完成控制
在测试中,声明两个int8数组,然后分别对a、b按行和按列进行拼接,每四个数据拼成一个32位的数据。然后启动运算,最后依次从指定地址读取结果数据。
第五天:基于高层次综合的硬件设计(二)
1.对于stream的建模(hls::stream)
利用hls stream实现两个小ip核运算,并封成一个大的ip核
#include "a1.h"
// Reads `length` values from `in` and writes the negated value of each one
// to `out`, in the same order.
void inverse(hls::stream<zheng_t> &in,hls::stream<zheng_t> &out,int length)
{
    for(int n=0;n<length;++n)
    {
#pragma HLS LOOP_TRIPCOUNT min=5000 max=5000 avg=5000
        const zheng_t sample=in.read();
        out.write(-sample);
    }
}
// Reads values from `in` two at a time and writes the sum of each pair to
// `out`; length/2 pairs are processed.
void add2to1(hls::stream<zheng_t> &in,hls::stream<zheng_t> &out,int length)
{
    const int pair_count=length/2;
    for(int n=0;n<pair_count;++n)
    {
#pragma HLS LOOP_TRIPCOUNT min=2500 max=2500 avg=2500
        // The two reads are kept as separate statements to fix their order.
        const zheng_t first=in.read();
        const zheng_t second=in.read();
        out.write(first+second);
    }
}
// Top-level function: chains inverse() and add2to1() through the internal
// stream mm under the DATAFLOW pragma. `length` values are read from `in`.
// NOTE(review): inverse() writes `length` values to mm while add2to1() reads
// 2*(length/2), so an odd `length` appears to leave one value in mm - confirm
// callers only pass even lengths.
void test(hls::stream<zheng_t> &in,hls::stream<zheng_t> &out,int length)
{
#pragma HLS DATAFLOW
#pragma HLS INTERFACE ap_hs port=out
#pragma HLS INTERFACE ap_hs port=in
// intermediate stream between the two stages
hls::stream<zheng_t> mm;
inverse(in,mm,length);
add2to1(mm,out,length);
}
2.池化单元的设计
代码:
#include "chihua.h"
// Top-level max-pooling function.
// Pipeline: pool_1D (pooling along the width with window Kx) ->
// pool_2D (pooling along the height with window Ky) -> hs2axis (adds the
// `last` flag to the output words).
// All scalar arguments and the block-level control are exposed over
// AXI4-Lite; `in` and `out` are AXI4-Stream ports.
void pool(hls::stream<dtype_bus> &in,hls::stream<dtype_stream> &out,
int ch_div_K,int height_in,int width_in,
int height_out,int width_out,int Kx,int Ky)
{
#pragma HLS INTERFACE s_axilite port=return
#pragma HLS INTERFACE s_axilite port=Ky
#pragma HLS INTERFACE s_axilite port=width_in
#pragma HLS INTERFACE s_axilite port=Kx
#pragma HLS INTERFACE s_axilite port=height_in
#pragma HLS INTERFACE s_axilite port=height_out
#pragma HLS INTERFACE s_axilite port=width_out
#pragma HLS INTERFACE s_axilite port=ch_div_K
#pragma HLS DATAFLOW
#pragma HLS INTERFACE axis register both port=out
#pragma HLS INTERFACE axis register both port=in
// intermediate streams between the three stages
hls::stream<dtype_bus> stream_tp;
#pragma HLS STREAM variable=stream_tp depth=8 dim=1
hls::stream<dtype_bus> stream_tp2;
// stage 1 consumes width_in words per row and emits one word every Kx words
pool_1D(in,stream_tp,ch_div_K,height_in,width_in,Kx);
// stage 2 consumes height_in rows of width_out words
pool_2D(stream_tp,stream_tp2,ch_div_K,height_in,width_out,Ky);
hs2axis(stream_tp2,out,ch_div_K,height_out,width_out);
}
// Copies height_out*width_out*ch_div_K words from `in` to `out`, wrapping
// each one in a dtype_stream whose `last` flag is set only on the final word.
void hs2axis(hls::stream<dtype_bus> &in,hls::stream<dtype_stream> &out,int ch_div_K,int height_out,int width_out)
{
#pragma HLS INTERFACE ap_stable port=height_out
#pragma HLS INTERFACE ap_stable port=width_out
#pragma HLS INTERFACE ap_stable port=ch_div_K
    // Total number of words to forward.
    const int total_words=height_out*width_out*ch_div_K;
    for(int idx=0;idx<total_words;idx++)
    {
#pragma HLS PIPELINE II=1
#pragma HLS LOOP_TRIPCOUNT min=200 max=200 avg=200
        dtype_stream word;
        word.data=in.read();
        word.last=(idx==(total_words-1));
        out.write(word);
    }
}
// Horizontal (along the width) max pooling.
// Reads ch_div_K*height_in*width_in words from `in`. Each word carries K
// 16-bit lanes. The last four words are kept in in_d0..in_d3 (in_d0 is the
// newest). Whenever (j+1) is a multiple of Kx, one word is written to `out`
// whose lanes are the lane-wise maximum over the newest Kx words.
// Kx values 1..4 are handled.
// NOTE(review): there is no default case, so tp_out is written without being
// assigned when Kx is outside 1..4 - confirm callers restrict Kx.
void pool_1D(hls::stream<dtype_bus> &in,hls::stream<dtype_bus> &out,int ch_div_K,int height_in,int width_in,int Kx)
{
// #pragma HLS INTERFACE axis register both port=out
// #pragma HLS INTERFACE axis register both port=in
#pragma HLS INTERFACE ap_stable port=width_in
#pragma HLS INTERFACE ap_stable port=Kx
#pragma HLS INTERFACE ap_stable port=height_in
#pragma HLS INTERFACE ap_stable port=ch_div_K
// history of the four most recent input words
dtype_bus in_d0,in_d1,in_d2,in_d3;
for(int c=0;c<ch_div_K;c++)
{
#pragma HLS LOOP_TRIPCOUNT min=1 max=1 avg=1
for(int i=0;i<height_in;i++)
{
#pragma HLS LOOP_TRIPCOUNT min=20 max=20 avg=20
for(int j=0;j<width_in;j++)
{
#pragma HLS PIPELINE II=1
#pragma HLS LOOP_TRIPCOUNT min=20 max=20 avg=20
// shift the history and take in the next word
in_d3=in_d2;in_d2=in_d1;in_d1=in_d0;
in_d0=in.read();
dtype_bus tp_out;
if((j+1)%Kx==0)//if need output
{
// per-lane maximum over the newest Kx words
for(int k=0;k<K;k++)
{
dtype_dat tp_d0=in_d0.range(16*k+15,16*k);
dtype_dat tp_d1=in_d1.range(16*k+15,16*k);
dtype_dat tp_d2=in_d2.range(16*k+15,16*k);
dtype_dat tp_d3=in_d3.range(16*k+15,16*k);
switch(Kx)
{
case 1:{tp_out.range(16*k+15,16*k)=tp_d0;break;}
case 2:{tp_out.range(16*k+15,16*k)=MAX(tp_d0,tp_d1);break;}
case 3:{tp_out.range(16*k+15,16*k)=MAX(tp_d0,MAX(tp_d1,tp_d2));break;}
case 4:{tp_out.range(16*k+15,16*k)=MAX(tp_d0,MAX(tp_d1,MAX(tp_d2,tp_d3)));break;}
}
}
out.write(tp_out);
}
}
}
}
}
// Vertical (along the height) max pooling.
// Reads ch_div_K*height_in*width_out words from `in` and stores each one in
// the circular buffer `buf` (POOL_2D_BUF_DEP entries, write index `ptr`).
// On rows where (i+1) is a multiple of Ky, one word is written to `out` per
// column: the lane-wise maximum over the current word and the words stored
// width_out, 2*width_out and 3*width_out positions earlier (as many as Ky
// needs). Ky values 1..4 are handled.
// NOTE(review): the wrap-around arithmetic adds POOL_2D_BUF_DEP at most once
// per step, so it presumably requires width_out <= POOL_2D_BUF_DEP, and
// (Ky-1)*width_out < POOL_2D_BUF_DEP for the older rows to still be in the
// buffer - confirm against the supported layer sizes.
// NOTE(review): there is no default case, so tp_out is written without being
// assigned when Ky is outside 1..4 - confirm callers restrict Ky.
void pool_2D(hls::stream<dtype_bus> &in,hls::stream<dtype_bus> &out,int ch_div_K,int height_in,int width_out,int Ky)
{
// #pragma HLS INTERFACE axis register both port=out
// #pragma HLS INTERFACE axis register both port=in
#pragma HLS INTERFACE ap_stable port=height_in
#pragma HLS INTERFACE ap_stable port=Ky
#pragma HLS INTERFACE ap_stable port=width_out
#pragma HLS INTERFACE ap_stable port=ch_div_K
dtype_bus buf[POOL_2D_BUF_DEP];
#pragma HLS ARRAY_PARTITION variable=buf complete dim=1
// write index into the circular buffer
int ptr=0;
for(int c=0;c<ch_div_K;c++)
{
#pragma HLS LOOP_TRIPCOUNT min=1 max=1 avg=1
for(int i=0;i<height_in;i++)
{
#pragma HLS LOOP_TRIPCOUNT min=20 max=20 avg=20
for(int j=0;j<width_out;j++)
{
#pragma HLS PIPELINE II=1
#pragma HLS LOOP_TRIPCOUNT min=10 max=10 avg=10
buf[ptr]=in.read();
if((i+1)%Ky==0)//if need output
{
// walk back one row (width_out entries) at a time, wrapping around the buffer
int ptr_tp=ptr;
dtype_bus in_d0,in_d1,in_d2,in_d3;
in_d0=buf[ptr_tp];
if(ptr_tp<width_out)
ptr_tp=ptr_tp-width_out+POOL_2D_BUF_DEP;
else
ptr_tp=ptr_tp-width_out;
in_d1=buf[ptr_tp];
if(ptr_tp<width_out)
ptr_tp=ptr_tp-width_out+POOL_2D_BUF_DEP;
else
ptr_tp=ptr_tp-width_out;
in_d2=buf[ptr_tp];
if(ptr_tp<width_out)
ptr_tp=ptr_tp-width_out+POOL_2D_BUF_DEP;
else
ptr_tp=ptr_tp-width_out;
in_d3=buf[ptr_tp];
dtype_bus tp_out;
// per-lane maximum over the newest Ky rows
for(int k=0;k<K;k++)
{
dtype_dat tp_d0=in_d0.range(16*k+15,16*k);
dtype_dat tp_d1=in_d1.range(16*k+15,16*k);
dtype_dat tp_d2=in_d2.range(16*k+15,16*k);
dtype_dat tp_d3=in_d3.range(16*k+15,16*k);
switch(Ky)
{
case 1:{tp_out.range(16*k+15,16*k)=tp_d0;break;}
case 2:{tp_out.range(16*k+15,16*k)=MAX(tp_d0,tp_d1);break;}
case 3:{tp_out.range(16*k+15,16*k)=MAX(tp_d0,MAX(tp_d1,tp_d2));break;}
case 4:{tp_out.range(16*k+15,16*k)=MAX(tp_d0,MAX(tp_d1,MAX(tp_d2,tp_d3)));break;}
}
}
out.write(tp_out);
}
// advance the write index, wrapping at the end of the buffer
if(ptr==POOL_2D_BUF_DEP-1)
ptr=0;
else
ptr++;
}
}
}
}
代码解析:总体设计思路是先完成水平方向的池化,然后完成垂直方向的池化。hs2axis函数的引入是因为与DMA模块连接时需要带last信号的数据接口,因此用它来扩充IP核的输出接口。#pragma HLS DATAFLOW的引入是为了优化HLS stream数据流。#pragma HLS LOOP_TRIPCOUNT min=20 max=20 avg=20可以在综合时限定循环次数,便于查看综合结果,不影响测试。#pragma HLS INTERFACE ap_stable port=height_out将端口声明为stable端口,不会对测试有影响。如果是ap_hs握手型端口,则只能放在最顶层模块,不能作为中间模块的输出端口。
头文件:
#ifndef __POOL_H__
#define __POOL_H__
#include <iostream>
#include <hls_stream.h>
#include <ap_int.h>

// Larger of A and B. Arguments are fully parenthesised so that expressions
// containing lower-precedence operators expand correctly.
// Note: each argument may be evaluated twice.
#define MAX(A,B) (((A)>(B))?(A):(B))

// Number of 16-bit lanes packed into one bus word.
#define K 8
// Number of entries in the circular buffer used by pool_2D.
#define POOL_2D_BUF_DEP 64

// One 16-bit signed data element.
typedef ap_int<16> dtype_dat;
// One bus word: K elements of 16 bits each.
typedef ap_int<16*K> dtype_bus;

// Output stream word: a bus word plus the `last` flag.
typedef struct
{
    dtype_bus data;
    bool last;
}dtype_stream;

// Top level: horizontal pooling, vertical pooling, then `last` flag insertion.
void pool(hls::stream<dtype_bus> &in,hls::stream<dtype_stream> &out,int ch_div_K,int height_in,int width_in,int height_out,int width_out,int Kx,int Ky);
// Max pooling along the width with window Kx.
void pool_1D(hls::stream<dtype_bus> &in,hls::stream<dtype_bus> &out,int ch_div_K,int height_in,int width_in,int Kx);
// Max pooling along the height with window Ky.
void pool_2D(hls::stream<dtype_bus> &in,hls::stream<dtype_bus> &out,int ch_div_K,int height_in,int width_out,int Ky);
// Wraps each bus word in a dtype_stream and marks the final word with `last`.
void hs2axis(hls::stream<dtype_bus> &in,hls::stream<dtype_stream> &out,int ch_div_K,int height_out,int width_out);
#endif
测试文件:
#include "chihua.h"
int main(void)
{
hls::stream<dtype_bus> in;
hls::stream<dtype_stream> out;
int ch_div_K=1;
int height_in=40;
int width_in=40;
int Kx=1;
int Ky=1;
for(int c=0;c<ch_div_K;c++)
{
for(int i=0;i<height_in*width_in;i++)
{
dtype_bus tp;
for(int j=0;j<K;j++)
tp.range(16*j+15,16*j)=i;
in.write(tp);
}
}
pool(in,out,ch_div_K,height_in,width_in,height_in/Ky,width_in/Kx,Kx,Ky);
for(int c=0;c<ch_div_K;c++)
{
for(int i=0;i<(height_in/Ky)*(width_in/Kx);i++)
{
dtype_stream tp=out.read();
std::cout<<"out:"<<i/(width_in/Kx)<<","<<i%(width_in/Kx)<<":"<<tp.data.toStringUnsigned(16)<<",last="<<tp.last<<std::endl;
}
}
}
3.卷积ip核的设计与实现
代码:
#include "conv.h"
//Feature: [C/K][H][W][K]
//kernel: [CHout][Ky][Kx][CHin/K][K]
// 2-D convolution over fixed-point feature maps stored in external memory.
//
// Scalars (all exposed over AXI4-Lite):
//   CHin, Hin, Win  - input channels, height, width
//   CHout           - output channels
//   Kx, Ky          - kernel width, height
//   Sx, Sy          - horizontal, vertical stride
//   mode            - 0: VALID (no padding), 1: SAME (pad (K-1)/2 on each side)
//   relu_en         - when 1, negative sums are clamped to 0
//   *_precision     - precision values; the accumulated sum is shifted right
//                     by feature_in_precision + W_precision - feature_out_precision
// Arrays (AXI4 master ports), each word packing K 16-bit lanes:
//   feature_in  - indexed as cin*Hin*Win + h*Win + w
//   W           - indexed as cout*CHin_div_K*Kx*Ky + ii*CHin_div_K*Kx + jj*CHin_div_K + cin
//   feature_out - indexed as (cout/K)*Wout*Hout + i*Wout + j, lane cout%K
//
// The result is saturated to the 16-bit signed range before it is stored.
// NOTE(review): out_truncate is unsigned, so the precision combination is
// presumably expected to be non-negative - confirm with the callers.
void Conv(ap_uint<16> CHin,ap_uint<16> Hin,ap_uint<16> Win,ap_uint<16> CHout,
        ap_uint<8> Kx,ap_uint<8> Ky,ap_uint<8> Sx,ap_uint<8> Sy,ap_uint<1> mode,ap_uint<1> relu_en,
        dtype_bus feature_in[],ap_uint<4> feature_in_precision,
        dtype_bus W[],ap_uint<4> W_precision,
        dtype_bus feature_out[],ap_uint<4> feature_out_precision
    )//mode: 0:VALID, 1:SAME
{
#pragma HLS INTERFACE s_axilite port=return
#pragma HLS INTERFACE s_axilite port=feature_out_precision
#pragma HLS INTERFACE s_axilite port=feature_in_precision
#pragma HLS INTERFACE s_axilite port=Sy
#pragma HLS INTERFACE s_axilite port=Kx
#pragma HLS INTERFACE s_axilite port=Win
#pragma HLS INTERFACE s_axilite port=Sx
#pragma HLS INTERFACE s_axilite port=Hin
#pragma HLS INTERFACE s_axilite port=W_precision
#pragma HLS INTERFACE s_axilite port=relu_en
#pragma HLS INTERFACE s_axilite port=Ky
#pragma HLS INTERFACE s_axilite port=CHin
#pragma HLS INTERFACE s_axilite port=mode
#pragma HLS INTERFACE s_axilite port=CHout
#pragma HLS INTERFACE m_axi depth=4294967295 port=feature_out offset=slave
#pragma HLS INTERFACE m_axi depth=4294967295 port=W offset=slave
#pragma HLS INTERFACE m_axi depth=4294967295 port=feature_in offset=slave
    ap_uint<8> pad_x,pad_y;
    // Number of K-lane words needed to hold CHin channels (rounded up).
    ap_uint<16> CHin_div_K=(CHin+K-1)/K;
    // Right shift applied to the accumulated sum.
    ap_uint<5> out_truncate;
    out_truncate=feature_in_precision+W_precision-feature_out_precision;

    if(mode==0)
    {
        pad_x=0;pad_y=0;
    }
    else
    {
        pad_x=(Kx-1)/2;pad_y=(Ky-1)/2;
    }

    ap_uint<16> Hout,Wout;
    Wout=(Win+2*pad_x-Kx)/Sx+1;
    Hout=(Hin+2*pad_y-Ky)/Sy+1;

    // The output-channel index is named ch_out so it does not shadow std::cout.
    for(int ch_out=0;ch_out<CHout;ch_out=ch_out+1)
    {
#pragma HLS LOOP_TRIPCOUNT min=4 max=4 avg=4
        for(int i=0;i<Hout;i++)
        {
#pragma HLS LOOP_TRIPCOUNT min=4 max=4 avg=4
            for(int j=0;j<Wout;j++)
            {
#pragma HLS LOOP_TRIPCOUNT min=4 max=4 avg=4
                dtype_acc sum=0;
                for(int ii=0;ii<Ky;ii++)
                {
#pragma HLS LOOP_TRIPCOUNT min=4 max=4 avg=4
                    for(int jj=0;jj<Kx;jj++)
                    {
#pragma HLS LOOP_TRIPCOUNT min=4 max=4 avg=4
                        // Input coordinates covered by this kernel tap; they
                        // fall outside the image inside the padding region.
                        ap_int<16> h=i*Sy-pad_y+ii;
                        ap_int<16> w=j*Sx-pad_x+jj;
                        for(int cin=0;cin<CHin_div_K;cin=cin+1)
                        {
#pragma HLS PIPELINE
#pragma HLS LOOP_TRIPCOUNT min=4 max=4 avg=4
                            dtype_mul_bus tp;
                            dtype_bus dat;
                            dtype_bus wt;
                            if(h>=0 && w>=0 && h<Hin && w<Win)
                            {
                                dat=feature_in[cin*Hin*Win+h*Win+w];
                                wt=W[ch_out*CHin_div_K*Kx*Ky+ii*CHin_div_K*Kx+jj*CHin_div_K+cin];
                            }
                            else
                            {
                                // padding contributes zero
                                dat=0;
                                wt=0;
                            }
                            // K lane-wise 16x16 multiplies ...
                            for(int k=0;k<K;k++)
                            {
                                tp.range(k*32+31,k*32)=(dtype_dat)dat.range(k*16+15,k*16)*(dtype_dat)wt.range(k*16+15,k*16);
                            }
                            // ... followed by accumulation of the K products
                            for(int k=0;k<K;k++)
                            {
                                sum+=(dtype_mul)tp.range(k*32+31,k*32);
                            }
                        }
                    }
                }
                // Optional ReLU (was `relu_en & sum<0`, which relied on
                // operator precedence).
                if(relu_en==1 && sum<0)
                    sum=0;
                dtype_acc res=sum>>out_truncate;
                // Saturate to the 16-bit signed range.
                if(res>32767)
                    res=32767;
                else
                    if(res<-32768)
                        res=-32768;
                // res_16 was previously computed but unused; it holds the same
                // low 16 bits that the range assignment kept from res.
                dtype_dat res_16=res;
                feature_out[(ch_out/K)*Wout*Hout+i*Wout+j].range((ch_out%K)*16+15,(ch_out%K)*16)=res_16;
            }
        }
    }
}
头文件:
// Shared declarations for the HLS convolution core: packing factor, fixed-point
// types and the prototype of the top-level Conv function.
#ifndef __CONV_CORE_H__
#define __CONV_CORE_H__
#include <ap_int.h>
#include <iostream>
using namespace std;
// Number of 16-bit channel lanes packed into one bus word.
#define K 8
// One signed 16-bit feature / weight value.
typedef ap_int<16> dtype_dat;
// One bus word: K packed 16-bit values.
typedef ap_int<16*K> dtype_bus;
// Product of two 16-bit values.
typedef ap_int<32> dtype_mul;
// K packed 32-bit products.
typedef ap_int<32*K> dtype_mul_bus;
// Accumulator type used for the running sum of products.
typedef ap_int<40> dtype_acc;
// Top-level convolution; feature_in / W / feature_out are arrays of bus words and
// the *_precision arguments select the fixed-point rescaling of the result.
void Conv(ap_uint<16> CHin,ap_uint<16> Hin,ap_uint<16> Win,ap_uint<16> CHout,
ap_uint<8> Kx,ap_uint<8> Ky,ap_uint<8> Sx,ap_uint<8> Sy,ap_uint<1> mode,ap_uint<1> relu_en,
dtype_bus feature_in[],ap_uint<4> feature_in_precision,
dtype_bus W[],ap_uint<4> W_precision,
dtype_bus feature_out[],ap_uint<4> feature_out_precision
);//mode: 0:VALID, 1:SAME
#endif
testbench:
#include "stdio.h"
#include "conv.h"
// Testbench configuration: input geometry.
#define IN_WIDTH 10
#define IN_HEIGHT 10
#define IN_CH 1
// Number of K-lane bus words needed for IN_CH channels (ceil division).
#define IN_CH_DIV_K ((IN_CH+K-1)/K)
// Kernel geometry and strides.
#define KERNEL_WIDTH 5
#define KERNEL_HEIGHT 5
#define X_STRIDE 1
#define Y_STRIDE 1
#define RELU_EN 0
#define MODE 0 //0:VALID, 1:SAME
// Padding is (kernel-1)/2 per side in SAME mode and 0 in VALID mode.
#define X_PADDING (MODE?(KERNEL_WIDTH-1)/2:0)
#define Y_PADDING (MODE?(KERNEL_HEIGHT-1)/2:0)
// Output geometry derived from the settings above.
#define OUT_CH 1
#define OUT_CH_DIV_K ((OUT_CH+K-1)/K)
#define OUT_WIDTH ((IN_WIDTH+2*X_PADDING-KERNEL_WIDTH)/X_STRIDE+1)
#define OUT_HEIGHT ((IN_HEIGHT+2*Y_PADDING-KERNEL_HEIGHT)/Y_STRIDE+1)
// C-simulation testbench: fills the input with a constant pattern, runs Conv once
// and prints lane 0 of every output word.
int main(void)
{
	dtype_bus feature_in[IN_CH_DIV_K][IN_HEIGHT][IN_WIDTH];
	dtype_bus W[OUT_CH][KERNEL_HEIGHT][KERNEL_WIDTH][IN_CH_DIV_K];
	dtype_bus feature_out[OUT_CH_DIV_K][OUT_HEIGHT][OUT_WIDTH];

	// Input: -(1<<14) in every real channel lane, 0 in the padding lanes.
	for(int grp=0;grp<IN_CH_DIV_K;grp++)
	{
		for(int row=0;row<IN_HEIGHT;row++)
		{
			for(int col=0;col<IN_WIDTH;col++)
			{
				for(int lane=0;lane<K;lane++)
				{
					const int lo=16*lane;
					const int hi=lo+15;
					const bool real_channel=(grp*K+lane)<IN_CH;
					if(real_channel)
					{
						feature_in[grp][row][col].range(hi,lo)=-(1<<14);
					}
					else
					{
						feature_in[grp][row][col].range(hi,lo)=0;
					}
				}
			}
		}
	}

	// Weights: (1<<14) in every lane of every kernel tap.
	for(int oc=0;oc<OUT_CH;oc++)
	{
		for(int ky=0;ky<KERNEL_HEIGHT;ky++)
		{
			for(int kx=0;kx<KERNEL_WIDTH;kx++)
			{
				for(int grp=0;grp<IN_CH_DIV_K;grp++)
				{
					for(int lane=0;lane<K;lane++)
					{
						W[oc][ky][kx][grp].range(16*lane+15,16*lane)=(1<<14);
					}
				}
			}
		}
	}

	// Clear the output buffer before the kernel writes individual lanes into it.
	for(int grp=0;grp<OUT_CH_DIV_K;grp++)
	{
		for(int row=0;row<OUT_HEIGHT;row++)
		{
			for(int col=0;col<OUT_WIDTH;col++)
			{
				feature_out[grp][row][col]=0;
			}
		}
	}

	printf("1234\n");

	Conv(IN_CH,IN_HEIGHT,IN_WIDTH,OUT_CH,
		KERNEL_WIDTH,KERNEL_HEIGHT,X_STRIDE,Y_STRIDE,MODE,RELU_EN,
		&feature_in[0][0][0],14,
		&W[0][0][0][0],14,
		&feature_out[0][0][0],10
		);//mode: 0:VALID, 1:SAME

	// Dump lane 0 of each output word, row by row.
	for(int row=0;row<OUT_HEIGHT;row++)
	{
		for(int col=0;col<OUT_WIDTH;col++)
		{
			for(int grp=0;grp<OUT_CH_DIV_K;grp++)
			{
				dtype_dat lane0=(dtype_dat)feature_out[grp][row][col].range(15,0);
				std::cout<<"OUT["<<grp<<"]["<<row<<"]["<<col<<"]="<<lane0<<std::endl;
			}
		}
	}
	return 0;
}
ip核连接
jupyter notebook上的测试:
测试程序:
from pynq import Overlay
import numpy as np
from pynq import Xlnk
import time
import random
# ---------------------------------------------------------------------------
# Convolution test configuration.
# ---------------------------------------------------------------------------
K = 8  # int16 channel lanes packed per bus word; must match the HLS core's K
in_width = 40
in_height = 40
in_channel = 1
Kx = 5
Ky = 5
Sx = 2
Sy = 2
RELU_EN = 0
MODE = 0  # 0: valid, 1: same
X_PADDING = (Kx - 1) // 2 if MODE == 1 else 0
Y_PADDING = (Ky - 1) // 2 if MODE == 1 else 0
out_channel = 1
out_width = (in_width + 2 * X_PADDING - Kx) // Sx + 1
out_height = (in_height + 2 * Y_PADDING - Ky) // Sy + 1

# ---------------------------------------------------------------------------
# Load the bitstream and grab the convolution IP.
# ---------------------------------------------------------------------------
xlnk = Xlnk()
ol = Overlay("juanjizhong.bit")
ol.download()
print(ol.ip_dict.keys())
conv = ol.Conv_0

# ---------------------------------------------------------------------------
# Physically contiguous buffers shared with the IP; the last axis holds the
# K packed channel lanes.
# ---------------------------------------------------------------------------
dat_in = xlnk.cma_array(shape=((in_channel + K - 1) // K, in_height, in_width, K), cacheable=0, dtype=np.int16)
wt = xlnk.cma_array(shape=(out_channel, Ky, Kx, (in_channel + K - 1) // K, K), cacheable=0, dtype=np.int16)
dat_out = xlnk.cma_array(shape=((out_channel + K - 1) // K, out_height, out_width, K), cacheable=0, dtype=np.int16)
dat_out_soft = xlnk.cma_array(shape=((out_channel + K - 1) // K, out_height, out_width, K), cacheable=0, dtype=np.int16)

# Start from known contents. The accelerator multiplies all K lanes of every
# word, so the padding lanes of dat_in must be zero, and the final comparison
# looks at every lane of both output buffers, including lanes neither
# implementation writes.
# NOTE(review): whether cma_array returns zeroed memory is not guaranteed here;
# clearing explicitly removes the dependency.
for _buf in (dat_in, dat_out, dat_out_soft):
    _buf.fill(0)

# Random input data in the real channel lanes only.
for i in range(dat_in.shape[0]):
    for j in range(dat_in.shape[1]):
        for k in range(dat_in.shape[2]):
            for l in range(dat_in.shape[3]):
                if i * K + l < in_channel:
                    dat_in[i][j][k][l] = random.randint(-1000, 1000)

# Random weights in every lane.
for i in range(wt.shape[0]):
    for j in range(wt.shape[1]):
        for k in range(wt.shape[2]):
            for l in range(wt.shape[3]):
                for m in range(wt.shape[4]):
                    wt[i][j][k][l][m] = random.randint(-1000, 1000)
def Run_Conv(chin, chout, kx, ky, sx, sy, mode, relu_en, feature_in, feature_in_precision, weight, weight_precision, feature_out, feature_out_precision):
    """Program the module-level ``conv`` IP, run it once and print the elapsed time.

    The input height and width are taken from ``feature_in.shape[1]`` and
    ``feature_in.shape[2]``; the three buffers are passed to the IP by their
    ``physical_address``.
    """
    # (register offset, value) pairs, written in ascending offset order.
    # NOTE(review): offsets presumably follow the HLS-generated s_axilite map
    # for Conv - confirm against the generated driver header.
    register_values = (
        (0x10, chin),
        (0x18, feature_in.shape[1]),
        (0x20, feature_in.shape[2]),
        (0x28, chout),
        (0x30, kx),
        (0x38, ky),
        (0x40, sx),
        (0x48, sy),
        (0x50, mode),
        (0x58, relu_en),
        (0x60, feature_in.physical_address),
        (0x68, feature_in_precision),
        (0x70, weight.physical_address),
        (0x78, weight_precision),
        (0x80, feature_out.physical_address),
        (0x88, feature_out_precision),
    )
    for offset, value in register_values:
        conv.write(offset, value)

    starttime = time.time()
    # Set the start bit (bit 0) while preserving bit 7 of the control register.
    conv.write(0, (conv.read(0) & 0x80) | 0x01)
    # Busy-wait until the done bit (bit 1) is observed.
    while True:
        status = conv.read(0)
        if (status >> 1) & 0x1:
            break
    endtime = time.time()
    print("Hardware run time=%s s"%(endtime-starttime))
def Run_Conv_Soft(chin, chout, kx, ky, sx, sy, mode, relu_en, feature_in, feature_in_precision, weight, weight_precision, feature_out, feature_out_precision):
    """Software reference model of the convolution accelerator.

    Args:
        chin, chout: number of input / output channels.
        kx, ky: kernel width / height.
        sx, sy: horizontal / vertical stride.
        mode: 0 for VALID (no padding), otherwise SAME ((k-1)//2 padding).
        relu_en: when truthy, a negative accumulator is clamped to 0 before
            rescaling.
        feature_in: indexed as [c // K][row][col][c % K].
        weight: indexed as [cout][ky][kx][c // K][c % K].
        feature_out: written as [cout // K][row][col][cout % K]; its
            shape[1] / shape[2] define the output height / width.
        feature_in_precision, weight_precision, feature_out_precision:
            the accumulator is shifted right by
            ``feature_in_precision + weight_precision - feature_out_precision``.

    The rescaled result is saturated to the signed 16-bit range.
    """
    if mode == 0:
        pad_x = 0
        pad_y = 0
    else:
        pad_x = (kx - 1) // 2
        pad_y = (ky - 1) // 2

    in_rows = feature_in.shape[1]
    in_cols = feature_in.shape[2]
    shift = feature_in_precision + weight_precision - feature_out_precision

    for i in range(chout):
        for j in range(feature_out.shape[1]):
            for k in range(feature_out.shape[2]):
                acc = 0
                for c in range(chin):
                    for ii in range(ky):
                        row = j * sy - pad_y + ii
                        if row < 0 or row >= in_rows:
                            continue  # tap lies in the vertical padding
                        for jj in range(kx):
                            col = k * sx - pad_x + jj
                            if col < 0 or col >= in_cols:
                                continue  # tap lies in the horizontal padding
                            dat = feature_in[c // K][row][col][c % K]
                            tap = weight[i][ii][jj][c // K][c % K]
                            # Plain Python ints avoid int16 overflow in the product.
                            acc += int(dat) * int(tap)

                # The hardware kernel applies ReLU on the accumulator before
                # rescaling; the reference must do the same.
                if relu_en and acc < 0:
                    acc = 0

                res = acc >> shift
                # Saturate to int16 (the lower bound is -32768, not +32768).
                if res > 32767:
                    res = 32767
                elif res < -32768:
                    res = -32768
                feature_out[i // K][j][k][i % K] = res
# Run the accelerator (it prints its own timing).
Run_Conv(in_channel, out_channel, Kx, Ky, Sx, Sy, MODE, RELU_EN, dat_in, 5, wt, 0, dat_out, 0)

# Run and time the software reference on the same data.
starttime = time.time()
Run_Conv_Soft(in_channel, out_channel, Kx, Ky, Sx, Sy, MODE, RELU_EN, dat_in, 5, wt, 0, dat_out_soft, 0)
endtime = time.time()
print("Software run time=%s s"%(endtime-starttime))

# Element-wise comparison of both outputs; every mismatch is reported.
flag = 1
for i, j, k, l in np.ndindex(*dat_out.shape[:4]):
    hw_value = dat_out[i][j][k][l]
    sw_value = dat_out_soft[i][j][k][l]
    if hw_value == sw_value:
        continue
    flag = 0
    print("Out_ [%d][%d][%d][%d]=%d"%(i,j,k,l,hw_value))
    print("Out_Soft[%d][%d][%d][%d]=%d"%(i,j,k,l,sw_value))

if flag == 1:
    print("============================\n result_match\n============================\n")
else:
    print("============================\n result_mismatch\n============================\n")
执行结果:
可以看到,用FPGA硬件加速卷积所需的时间远远短于直接在ARM上运行的软件卷积程序。
第六天:基于HLS的卷积神经网络软硬件映射
第七天:作业答辩与讲评
实现MNIST数据集的测试:
前期已经在MNIST数据集上训练好了模型,现在要做的是利用硬件来运行该模型并验证测试结果。
搭建的硬件模块及其连线:
jupyter进行的硬件调试:
1)相关小的ip核配置
from pynq import Overlay
import numpy as np
from pynq import Xlnk
import struct
# Number of int16 channel lanes packed into one bus word; must match the
# K used by the hardware IP (the HLS header defines K as 8).
K=8
def Disp_Feature(feature):
    """Print every non-zero element of a 4-D feature array together with its index."""
    shape = np.shape(feature)
    for a in range(shape[0]):
        for b in range(shape[1]):
            for c in range(shape[2]):
                for d in range(shape[3]):
                    value = feature[a][b][c][d]
                    if value == 0:
                        continue
                    print("out[%d,%d,%d,%d]=%d"%(a,b,c,d,value))
def Disp_Weight(weight):
    """Print every element of a 5-D weight array together with its index."""
    shape = np.shape(weight)
    for a in range(shape[0]):
        for b in range(shape[1]):
            for c in range(shape[2]):
                for d in range(shape[3]):
                    for e in range(shape[4]):
                        value = weight[a][b][c][d][e]
                        print("out[%d,%d,%d,%d,%d]=%d"%(a,b,c,d,e,value))
def Load_Weight_From_File(weight, file):
    """Fill a 5-D weight array from a binary file of native-endian int16 values.

    Values are consumed in row-major order over the first five axes of
    ``weight`` (the last axis varies fastest), two bytes per element.

    Args:
        weight: 5-D array-like that supports ``weight[i][j][k][l][m] = value``.
        file: path of the binary weight file.

    Raises:
        struct.error: if the file holds fewer values than ``weight`` needs.
            The check happens before any element is written, so ``weight`` is
            never left partially filled.
    """
    shape = np.shape(weight)
    dims = [int(shape[axis]) for axis in range(5)]
    count = 1
    for extent in dims:
        count *= extent

    # One bulk read instead of one 2-byte read per element.
    with open(file, 'rb') as fp:
        raw = fp.read(2 * count)
    if len(raw) != 2 * count:
        raise struct.error(
            "weight file %r ended after %d bytes, expected %d"
            % (file, len(raw), 2 * count))

    # "h" is a native-order signed 16-bit integer, as in the per-element format.
    values = struct.unpack("%dh" % count, raw)
    for (i, j, k, l, m), value in zip(np.ndindex(*dims), values):
        weight[i][j][k][l][m] = value
def Run_Pool(pool, dma, ch, kx, ky, feature_in, feature_out):
    """Configure the pooling IP, stream ``feature_in`` through it via DMA and wait for completion.

    The channel count is written as the number of K-lane words
    (``(ch + K - 1) // K``); input and output height / width are taken from
    ``shape[1]`` and ``shape[2]`` of the respective buffers.
    """
    # (register offset, value) pairs, written in ascending offset order.
    settings = (
        (0x10, (ch + K - 1) // K),
        (0x18, feature_in.shape[1]),
        (0x20, feature_in.shape[2]),
        (0x28, feature_out.shape[1]),
        (0x30, feature_out.shape[2]),
        (0x38, kx),
        (0x40, ky),
    )
    for offset, value in settings:
        pool.write(offset, value)

    # Set the start bit (bit 0) while preserving bit 7 of the control register.
    pool.write(0, (pool.read(0) & 0x80) | 0x01)

    # Arm the receive channel first, then push the input, then wait for both.
    dma.recvchannel.transfer(feature_out)
    dma.sendchannel.transfer(feature_in)
    dma.sendchannel.wait()
    dma.recvchannel.wait()

    # Busy-wait until the done bit (bit 1) is observed.
    while True:
        status = pool.read(0)
        if (status >> 1) & 0x1:
            break
def Run_Conv(conv, chin, chout, kx, ky, sx, sy, mode, relu_en, feature_in, feature_in_precision, weight, weight_precision, feature_out, feature_out_precision):
    """Program the given convolution IP, start it and block until it reports done.

    The input height and width are taken from ``feature_in.shape[1]`` and
    ``feature_in.shape[2]``; the three buffers are passed to the IP by their
    ``physical_address``.
    """
    # (register offset, value) pairs, written in ascending offset order.
    # NOTE(review): offsets presumably follow the HLS-generated s_axilite map
    # for Conv - confirm against the generated driver header.
    register_values = (
        (0x10, chin),
        (0x18, feature_in.shape[1]),
        (0x20, feature_in.shape[2]),
        (0x28, chout),
        (0x30, kx),
        (0x38, ky),
        (0x40, sx),
        (0x48, sy),
        (0x50, mode),
        (0x58, relu_en),
        (0x60, feature_in.physical_address),
        (0x68, feature_in_precision),
        (0x70, weight.physical_address),
        (0x78, weight_precision),
        (0x80, feature_out.physical_address),
        (0x88, feature_out_precision),
    )
    for offset, value in register_values:
        conv.write(offset, value)

    # Set the start bit (bit 0) while preserving bit 7 of the control register.
    conv.write(0, (conv.read(0) & 0x80) | 0x01)
    # Busy-wait until the done bit (bit 1) is observed.
    while True:
        status = conv.read(0)
        if (status >> 1) & 0x1:
            break
def Run_Conv_Soft(chin, chout, kx, ky, sx, sy, mode, relu_en, feature_in, feature_in_precision, weight, weight_precision, feature_out, feature_out_precision):
    """Software reference model of the convolution accelerator.

    Args:
        chin, chout: number of input / output channels.
        kx, ky: kernel width / height.
        sx, sy: horizontal / vertical stride.
        mode: 0 for VALID (no padding), otherwise SAME ((k-1)//2 padding).
        relu_en: when truthy, a negative accumulator is clamped to 0 before
            rescaling.
        feature_in: indexed as [c // K][row][col][c % K].
        weight: indexed as [cout][ky][kx][c // K][c % K].
        feature_out: written as [cout // K][row][col][cout % K]; its
            shape[1] / shape[2] define the output height / width.
        feature_in_precision, weight_precision, feature_out_precision:
            the accumulator is shifted right by
            ``feature_in_precision + weight_precision - feature_out_precision``.

    The rescaled result is saturated to the signed 16-bit range.
    """
    if mode == 0:
        pad_x = 0
        pad_y = 0
    else:
        pad_x = (kx - 1) // 2
        pad_y = (ky - 1) // 2

    in_rows = feature_in.shape[1]
    in_cols = feature_in.shape[2]
    shift = feature_in_precision + weight_precision - feature_out_precision

    for i in range(chout):
        for j in range(feature_out.shape[1]):
            for k in range(feature_out.shape[2]):
                acc = 0
                for c in range(chin):
                    for ii in range(ky):
                        row = j * sy - pad_y + ii
                        if row < 0 or row >= in_rows:
                            continue  # tap lies in the vertical padding
                        for jj in range(kx):
                            col = k * sx - pad_x + jj
                            if col < 0 or col >= in_cols:
                                continue  # tap lies in the horizontal padding
                            dat = feature_in[c // K][row][col][c % K]
                            tap = weight[i][ii][jj][c // K][c % K]
                            # Plain Python ints avoid int16 overflow in the product.
                            acc += int(dat) * int(tap)

                # The hardware kernel applies ReLU on the accumulator before
                # rescaling; the reference must do the same.
                if relu_en and acc < 0:
                    acc = 0

                res = acc >> shift
                # Saturate to int16 (the lower bound is -32768, not +32768).
                if res > 32767:
                    res = 32767
                elif res < -32768:
                    res = -32768
                feature_out[i // K][j][k][i % K] = res
2)测试程序