卷积核 kernel_size=3*3
输入特征图 fmap[width,height]=[9,9]
Verilog HDL
Xilinx VIVADO
源文件
`timescale 1ns / 1ps
// conv_pe: 3x3 convolution processing element.
//
// Nine dsp48_macro_0 instances each receive one kernel coefficient (A) and the
// matching feature-map sample (B), with C tied to zero. Their nine 17-bit P
// outputs are sign-extended to 32 bits, summed, and registered into sum_data.
//
// Timing visible in this module: input_en is delayed by four clocks; on the
// clock where the delayed enable is high the sum is captured and valid_out is
// set, so valid_out/sum_data appear five clock edges after input_en.
//
// NOTE(review): dsp48_macro_0 is an IP defined outside this file. It is
// presumably configured as P = A*B + C with a latency that lines up with the
// four-stage enable delay -- confirm against the IP configuration.
module conv_pe(
    input         clk,
    input         rst,        // synchronous, active-high reset
    input         input_en,   // input enable: the kernel/fmap operands are presented
    // 3x3 kernel window, row-major (kernel_<row><col>)
    input  [7:0]  kernel_00,
    input  [7:0]  kernel_01,
    input  [7:0]  kernel_02,
    input  [7:0]  kernel_10,
    input  [7:0]  kernel_11,
    input  [7:0]  kernel_12,
    input  [7:0]  kernel_20,
    input  [7:0]  kernel_21,
    input  [7:0]  kernel_22,
    // 3x3 feature-map window covered by the kernel, row-major (fmap_<row><col>)
    input  [7:0]  fmap_00,
    input  [7:0]  fmap_01,
    input  [7:0]  fmap_02,
    input  [7:0]  fmap_10,
    input  [7:0]  fmap_11,
    input  [7:0]  fmap_12,
    input  [7:0]  fmap_20,
    input  [7:0]  fmap_21,
    input  [7:0]  fmap_22,
    output        valid_out,  // output valid: high for one clock per captured sum
    output [31:0] sum_data    // registered convolution sum
);

    // ------------------------------------------------------------------
    // Enable delay line: en_pipe[k] is input_en delayed by k+1 clocks.
    // ------------------------------------------------------------------
    reg [3:0] en_pipe;

    always @(posedge clk) begin
        if (rst)
            en_pipe <= 4'b0000;
        else
            en_pipe <= {en_pipe[2:0], input_en};
    end

    wire sum_en = en_pipe[3];   // input_en delayed by four clocks

    // ------------------------------------------------------------------
    // Products coming back from the nine DSP instances (17 bits each).
    // ------------------------------------------------------------------
    wire [16:0] dsp_00, dsp_01, dsp_02;
    wire [16:0] dsp_10, dsp_11, dsp_12;
    wire [16:0] dsp_20, dsp_21, dsp_22;

    // Sign-extend one 17-bit product to the 32-bit accumulator width.
    function [31:0] sext17;
        input [16:0] p;
        begin
            sext17 = {{15{p[16]}}, p};
        end
    endfunction

    // Per-row partial sums; all arithmetic is 32-bit wrap-around.
    wire [31:0] row0_sum = sext17(dsp_00) + sext17(dsp_01) + sext17(dsp_02);
    wire [31:0] row1_sum = sext17(dsp_10) + sext17(dsp_11) + sext17(dsp_12);
    wire [31:0] row2_sum = sext17(dsp_20) + sext17(dsp_21) + sext17(dsp_22);

    // ------------------------------------------------------------------
    // Output registers.
    // ------------------------------------------------------------------
    reg        valid_out_r;   // registered output valid
    reg [31:0] sum_data_r;    // registered convolution sum

    always @(posedge clk) begin
        if (rst) begin
            valid_out_r <= 0;
            sum_data_r  <= 32'd0;
        end else if (sum_en) begin
            // Capture the sum of all nine products and flag it as valid.
            sum_data_r  <= row0_sum + row1_sum + row2_sum;
            valid_out_r <= 1;
        end else begin
            // No new result this cycle; sum_data_r holds its previous value.
            valid_out_r <= 0;
        end
    end

    assign valid_out = valid_out_r;
    assign sum_data  = sum_data_r;

    // ------------------------------------------------------------------
    // One DSP instance per kernel tap. Port width notes follow the
    // original source: A[7:0], B[7:0], C[1:0], P[16:0].
    // ------------------------------------------------------------------
    // Row 0
    dsp48_macro_0 uut_dsp48_1(
        .CLK(clk),
        .A(kernel_00),
        .B(fmap_00),
        .C(2'd0),
        .P(dsp_00)
    );
    dsp48_macro_0 uut_dsp48_2(
        .CLK(clk),
        .A(kernel_01),
        .B(fmap_01),
        .C(2'd0),
        .P(dsp_01)
    );
    dsp48_macro_0 uut_dsp48_3(
        .CLK(clk),
        .A(kernel_02),
        .B(fmap_02),
        .C(2'd0),
        .P(dsp_02)
    );
    // Row 1
    dsp48_macro_0 uut_dsp48_4(
        .CLK(clk),
        .A(kernel_10),
        .B(fmap_10),
        .C(2'd0),
        .P(dsp_10)
    );
    dsp48_macro_0 uut_dsp48_5(
        .CLK(clk),
        .A(kernel_11),
        .B(fmap_11),
        .C(2'd0),
        .P(dsp_11)
    );
    dsp48_macro_0 uut_dsp48_6(
        .CLK(clk),
        .A(kernel_12),
        .B(fmap_12),
        .C(2'd0),
        .P(dsp_12)
    );
    // Row 2
    dsp48_macro_0 uut_dsp48_7(
        .CLK(clk),
        .A(kernel_20),
        .B(fmap_20),
        .C(2'd0),
        .P(dsp_20)
    );
    dsp48_macro_0 uut_dsp48_8(
        .CLK(clk),
        .A(kernel_21),
        .B(fmap_21),
        .C(2'd0),
        .P(dsp_21)
    );
    dsp48_macro_0 uut_dsp48_9(
        .CLK(clk),
        .A(kernel_22),
        .B(fmap_22),
        .C(2'd0),
        .P(dsp_22)
    );

    // ------------------------------------------------------------------
    // 32-bit counter of valid results produced since reset. It is a plain
    // register (not a memory) and drives no output of this module.
    // ------------------------------------------------------------------
    reg [31:0] conv_pe_count;

    always @(posedge clk) begin : proc_
        if (rst)
            conv_pe_count <= 0;
        else if (valid_out)
            conv_pe_count <= conv_pe_count + 1;
    end

endmodule
仿真文件
`timescale 1ns / 1ps
// tb_conv_pe: simulation testbench for conv_pe.
//
// Loads a 9x9 feature map from fmap.txt, slides a 3x3 window over it
// (left-to-right, then top-to-bottom, stride 1), feeds each window together
// with a fixed 3x3 kernel into conv_pe, and writes every valid result to
// result.txt, one output row (7 values) per text line.
//
// Fixes relative to the original:
//   * the trailing comment on the "j <= j + 1" statement used a single '/'
//     and broke compilation;
//   * the result file is now closed after the last result is written, and a
//     failed $fopen is reported instead of being silently ignored.
module tb_conv_pe;

    // Geometry of the test. OUT_W/OUT_H = fmap size - kernel size + 1.
    localparam FMAP_W = 9;
    localparam FMAP_H = 9;
    localparam KSIZE  = 3;
    localparam OUT_W  = FMAP_W - KSIZE + 1;   // 7
    localparam OUT_H  = FMAP_H - KSIZE + 1;   // 7

    reg clk;
    reg rst;

    // Reset is held high for the first 1000 time units.
    initial begin
        rst = 1;
        #1000 rst = 0;
    end

    // Free-running clock, period 10 time units, starting low.
    initial begin : clk1_blk
        clk = 0;
        forever #5 clk = ~clk;
    end

    // 3x3 kernel coefficients and the 3x3 feature-map window under the kernel.
    reg [7:0] kernel_00, kernel_01, kernel_02,
              kernel_10, kernel_11, kernel_12,
              kernel_20, kernel_21, kernel_22;
    reg [7:0] fmap_00, fmap_01, fmap_02,
              fmap_10, fmap_11, fmap_12,
              fmap_20, fmap_21, fmap_22;

    reg         input_en;    // input enable to the UUT
    wire        valid_out;   // output valid from the UUT
    wire [31:0] sum_data;    // convolution result from the UUT

    // Unit under test.
    conv_pe uut_conv_pe(
        .clk(clk),
        .rst(rst),
        .input_en(input_en),
        .kernel_00(kernel_00),
        .kernel_01(kernel_01),
        .kernel_02(kernel_02),
        .kernel_10(kernel_10),
        .kernel_11(kernel_11),
        .kernel_12(kernel_12),
        .kernel_20(kernel_20),
        .kernel_21(kernel_21),
        .kernel_22(kernel_22),
        .fmap_00(fmap_00),
        .fmap_01(fmap_01),
        .fmap_02(fmap_02),
        .fmap_10(fmap_10),
        .fmap_11(fmap_11),
        .fmap_12(fmap_12),
        .fmap_20(fmap_20),
        .fmap_21(fmap_21),
        .fmap_22(fmap_22),
        .valid_out(valid_out),
        .sum_data(sum_data)
    );

    // 9x9 input feature map stored row-major: element (row r, col c) is
    // fmap[c + FMAP_W*r]. The [high:low] declaration order is kept as in the
    // original because it can influence the default $readmemh load order.
    reg [7:0] fmap [FMAP_W*FMAP_H-1:0];

    initial begin
        $readmemh("F://vivado_project//conv_pe//fmap.txt", fmap);
    end

    // i = window column offset (moves right), j = window row offset (moves down).
    integer i, j;
    reg [2:0] states;

    // Stimulus state machine:
    //   0: present the window at (i, j) with input_en high for one clock
    //   1: drop input_en, advance to the next row when a row is finished,
    //      and stop after the last window
    //   2: done; inputs are cleared
    always @(posedge clk) begin
        if (rst) begin
            i        <= 0;
            j        <= 0;
            states   <= 0;
            input_en <= 0;
            // Feature-map window cleared while in reset.
            fmap_00 <= 8'd0;
            fmap_01 <= 8'd0;
            fmap_02 <= 8'd0;
            fmap_10 <= 8'd0;
            fmap_11 <= 8'd0;
            fmap_12 <= 8'd0;
            fmap_20 <= 8'd0;
            fmap_21 <= 8'd0;
            fmap_22 <= 8'd0;
            // Fixed kernel: [1 0 -1; 2 0 -2; 1 0 -1] in 8-bit two's complement.
            kernel_00 <= 8'd1;
            kernel_01 <= 8'd0;
            kernel_02 <= -8'd1;
            kernel_10 <= 8'd2;
            kernel_11 <= 8'd0;
            kernel_12 <= -8'd2;
            kernel_20 <= 8'd1;
            kernel_21 <= 8'd0;
            kernel_22 <= -8'd1;
        end
        else begin
            case (states)
                0: begin
                    if (i < OUT_W) begin   // valid column offsets are 0..OUT_W-1
                        input_en <= 1;
                        // Window row 0, e.g. i=0,j=0 -> fmap[0], fmap[1], fmap[2]
                        fmap_00 <= fmap[i + 0 + FMAP_W*j];
                        fmap_01 <= fmap[i + 1 + FMAP_W*j];
                        fmap_02 <= fmap[i + 2 + FMAP_W*j];
                        // Window row 1, e.g. i=0,j=0 -> fmap[9], fmap[10], fmap[11]
                        fmap_10 <= fmap[i + 0 + FMAP_W*(j + 1)];
                        fmap_11 <= fmap[i + 1 + FMAP_W*(j + 1)];
                        fmap_12 <= fmap[i + 2 + FMAP_W*(j + 1)];
                        // Window row 2, e.g. i=0,j=0 -> fmap[18], fmap[19], fmap[20]
                        fmap_20 <= fmap[i + 0 + FMAP_W*(j + 2)];
                        fmap_21 <= fmap[i + 1 + FMAP_W*(j + 2)];
                        fmap_22 <= fmap[i + 2 + FMAP_W*(j + 2)];
                        i      <= i + 1;   // slide the window one column to the right
                        states <= 1;
                    end
                    else begin
                        input_en <= 0;
                    end
                end
                1: begin
                    input_en <= 0;
                    if (i == OUT_W) begin   // current row of windows is finished
                        i <= 0;
                        j <= j + 1;         // next pass starts one row further down
                    end
                    if (j < OUT_H) begin    // valid row offsets are 0..OUT_H-1
                        states <= 0;
                    end
                    // Last window of the last row has been issued: stop.
                    // This assignment comes last so it overrides the one above.
                    if (j == OUT_H - 1 && i == OUT_W) begin
                        states <= 2;
                    end
                end
                2: begin
                    // All windows issued; clear the inputs.
                    input_en <= 0;
                    fmap_00 <= 8'd0;
                    fmap_01 <= 8'd0;
                    fmap_02 <= 8'd0;
                    fmap_10 <= 8'd0;
                    fmap_11 <= 8'd0;
                    fmap_12 <= 8'd0;
                    fmap_20 <= 8'd0;
                    fmap_21 <= 8'd0;
                    fmap_22 <= 8'd0;
                end
            endcase
        end
    end

    // ------------------------------------------------------------------
    // Result logging.
    // ------------------------------------------------------------------
    integer end_temp;    // file descriptor of result.txt
    integer count;       // results written on the current text line
    integer written;     // total results written so far
    reg     file_open;   // high while end_temp refers to an open file

    initial begin
        end_temp  = $fopen("F://vivado_project//conv_pe//result.txt", "w");
        file_open = (end_temp != 0);
        if (end_temp == 0)
            $display("tb_conv_pe: could not open result.txt for writing; results will not be logged");
    end

    always @(posedge clk) begin
        if (rst) begin
            count   <= 0;
            written <= 0;
        end
        else if (uut_conv_pe.valid_out && file_open) begin
            // Log each valid result as a signed decimal.
            $fwrite(end_temp, "%d", $signed(uut_conv_pe.sum_data));
            count   <= count + 1;
            written <= written + 1;
            if (count == OUT_W - 1) begin
                // OUT_W values make up one output row: wrap to a new line.
                count <= 0;
                $fwrite(end_temp, "\n");
            end
            if (written == OUT_W*OUT_H - 1) begin
                // Last result of the run: close the file so the data is flushed.
                $fclose(end_temp);
                file_open <= 0;
            end
        end
    end

endmodule
注:代码非本人编写,我只是加上了自己理解的注释。源码并非来自网络;未加密、可以分享的内容我自然会分享,不必再询问是否可以分享源码。