Eyeriss_脉动阵列实现卷积操作(附完整Verilog代码)
一、Eyeriss介绍
《Eyeriss: A Spatial Architecture for Energy-Efficient Dataflow for Convolutional Neural Networks》是一篇由麻省理工学院(MIT)与 NVIDIA 的研究人员(Yu-Hsin Chen、Joel Emer、Vivienne Sze)撰写的论文。该论文于2016年在 ACM/IEEE International Symposium on Computer Architecture (ISCA) 上发表。该论文提出了 Eyeriss,是一种专门为卷积神经网络(CNN)设计的空间架构,旨在实现高效的能耗管理和数据流控制。在深度学习中,CNN 是一种用于图像处理等任务的重要神经网络架构。
Eyeriss架构的特点
Eyeriss 的设计灵感来自于神经科学中对视觉皮层的研究,它采用了一种空间计算的方法来优化卷积操作。该架构具有多个特点和创新之处:
- 空间数据流:Eyeriss 使用了空间数据流的方式来处理卷积操作。它将输入特征映射分割为多个小块,并通过拓扑连接实现并行计算和通信。这种空间数据流的设计使得 Eyeriss 可以高效地执行卷积操作。
- 数据重用:Eyeriss 通过使用本地缓存和共享缓冲区来实现数据重用。它可以在不同的层之间共享数据,减少数据传输和存储开销,提高能耗效率。
- 灵活性:Eyeriss 具有可配置的结构,可以根据不同的卷积神经网络模型进行优化。它可以适应不同大小和类型的卷积核,并灵活地分配计算和存储资源。
- 能耗效率:通过采用空间计算和数据重用的方法,Eyeriss 在相同的面积和功耗下,相比于传统的通用处理器,在执行卷积操作时具有更高的能耗效率。
- 该论文通过在 FPGA 和 ASIC 平台上的实验评估了 Eyeriss 的性能。结果显示,Eyeriss 在处理卷积神经网络任务时能够显著减少能耗,并且具有可接受的计算延迟。
总之,《Eyeriss: A Spatial Architecture for Energy-Efficient Dataflow for Convolutional Neural Networks》介绍了 Eyeriss 架构,它是为了提高卷积神经网络的能耗效率而设计的一种空间架构。这项研究对于深度学习算法在边缘设备和嵌入式系统上的实现具有重要意义。
论文链接:https://ieeexplore.ieee.org/document/7551407
二、Eyeriss实现卷积原理
数据流
如上图所示,设计了3*3的PE阵列,每个PE单元能完成对应的3组数相乘后相加,如a*b+c*d+e*f(Verilog代码见下文),对于这个3*3PE的阵列,卷积核的输入如上图绿色箭头表示,把卷积核的3行按顺序分别输入对应阵列的3行中,图像的输入如上图中蓝色箭头表示,把图像按顺序输入至阵列中,每个PE单元1次能从图像和卷积核分别读3个数进行操作,同时输入至1个PE单元的6个数分别进行相乘后相加,数据跟随时钟在PE阵列中进行游动,且每个时钟把一列的3个PE中的数相加,即得到输出图像的1个像素。
在clk0时,仅有最左边的3个PE得到了filter的row1~row3以及ifmap的row1~row3。在经过1个clk的MAC后,得到了Ofmap_00的所有psum,将它们相加即可得Ofmap_00。在clk1时,最左边的3个PE在计算Ofmap_01的psum,而第二列的PE也得到了上一个PE使用过并传递给它的值,在计算Ofmap_10的psum。同理,在clk2时,可计算出Ofmap_02、Ofmap_11、Ofmap_20。即可粗略作图如下:
三、Verilog实现
PE单元
`timescale 1ns / 1ps
// ConvPE3x3: one processing element of the 3x3 array.
//
// Holds three packed 8-bit weights in a local register and, on every enabled
// clock, multiplies them element-wise with three packed 8-bit ifmap values
// and registers the sum of the three products. The ifmap word and the
// weight word seen on the inputs are also registered and forwarded so that a
// neighbouring PE can consume them one clock later.
//
// All operands are unsigned (no `signed` declarations are used).
// Worst case result: 3 * 255 * 255 = 195075, which fits in the 18-bit OUT.
module ConvPE3x3 (
    input             clk          ,
    input             rst_n        ,  // asynchronous reset, active low
    input             CE           ,  // compute/forward enable
    input      [23:0] IN1          ,  // ifmap: three packed 8-bit values
    input             WEIGHT_IN_EN ,  // load enable for the local weight register
    output reg        WEIGHT_OUT_EN,  // set once this PE has been enabled; cleared only by reset
    input      [23:0] IN2          ,  // weight: three packed 8-bit values
    output reg [23:0] NEXT_PE_IN1  ,  // registered copy of IN1 for the next PE
    output reg [23:0] NEXT_PE_IN2  ,  // registered copy of IN2 for the next PE
    output reg [17:0] OUT             // registered sum of the three products
);

    // Locally stored weights used by the MAC below.
    reg [23:0] weight;

    // Weight register. Loading is independent of CE.
    // Reset to zero so that the MAC never operates on an unknown weight.
    always @(posedge clk or negedge rst_n) begin
        if (!rst_n)
            weight <= 24'd0;
        else if (WEIGHT_IN_EN)
            weight <= IN2;
    end

    // MAC and forwarding registers.
    // Every register written in this block is also given a reset value;
    // otherwise the forwarding registers power up unknown and the unknown
    // value propagates into the next PE's products.
    always @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            OUT           <= 18'd0;
            WEIGHT_OUT_EN <= 1'b0;
            NEXT_PE_IN1   <= 24'd0;
            NEXT_PE_IN2   <= 24'd0;
        end else if (CE) begin
            // Byte-wise multiply, then accumulate the three products.
            OUT <= IN1[23:16] * weight[23:16]
                 + IN1[15: 8] * weight[15: 8]
                 + IN1[ 7: 0] * weight[ 7: 0];
            NEXT_PE_IN1   <= IN1;
            NEXT_PE_IN2   <= IN2;   // forwards the incoming word, not the stored weight
            WEIGHT_OUT_EN <= 1'b1;
        end else begin
            // Disabled: output is forced to zero, forwarding registers hold.
            OUT <= 18'd0;
        end
    end

endmodule
顶层文件
`timescale 1ns / 1ps
// eyeriss: top level of the 3x3 PE array.
//
// Inputs
//   filter : 3x3 kernel, 8 bits per coefficient, row 0 in the top 24 bits.
//   image  : 64 rows x 48 columns, 8 bits per pixel, row 0 in the top 384 bits.
// Outputs
//   o_1/o_2/o_3 : registered sum of the three PE outputs of array column 1/2/3.
//
// Operation
//   * `cnt` walks a 3-pixel wide window across the 48 columns (46 positions).
//   * `flag` selects a group of five consecutive image rows; consecutive
//     groups start three rows apart (rows 3*flag .. 3*flag+4).
//   * Rows 0..2 of the group feed the first array column directly; rows 3 and
//     4 are delayed by one and two clocks and feed the bottom PE of array
//     columns 2 and 3. The remaining PEs receive their ifmap word from the
//     NEXT_PE_IN1 output of the PE one row below in the previous column.
//
// NOTE: each PE output can reach 3*255*255 = 195075, so the sum of three can
// reach 585225, which does not fit in 18 bits (max 262143). The port width is
// kept for interface compatibility; results above 18 bits are truncated.
module eyeriss (
    input                    clk   ,
    input                    rst_n ,  // asynchronous reset, active low
    input                    CE    ,  // global compute enable
    input      [  3*3*8-1:0] filter,
    input      [64*48*8-1:0] image ,  // row 0: [64*48*8-1 -: 48*8], row 1: [63*48*8-1 -: 48*8], ...
    output reg [       17:0] o_1, o_2, o_3
);

    localparam ROW_STEP   = 3;   // image rows advanced per `flag` increment
    localparam LAST_GROUP = 20;  // last row group: rows 60..63, fifth row padded with 0

    // ------------------------------------------------------------------
    // Registered, row-sliced copies of the inputs
    // ------------------------------------------------------------------
    reg [48*8-1:0] image_row [63:0];
    reg [    23:0] filter_row[ 2:0];  // written below, not read anywhere in this module

    generate
        genvar i_0;
        for (i_0 = 0; i_0 < 64; i_0 = i_0 + 1) begin : g_row_reg
            always @(posedge clk or negedge rst_n) begin
                if (!rst_n)
                    image_row[i_0] <= 0;
                else
                    image_row[i_0] <= image[(64-i_0)*48*8-1 -: 48*8];
            end
        end
    endgenerate

    generate
        genvar i_1;
        for (i_1 = 0; i_1 < 3; i_1 = i_1 + 1) begin : filter_row_reg
            always @(posedge clk or negedge rst_n) begin
                if (!rst_n)
                    filter_row[i_1] <= 0;
                else
                    filter_row[i_1] <= filter[(3-i_1)*3*8-1 -: 3*8];
            end
        end
    endgenerate

    // ------------------------------------------------------------------
    // Window position counters
    // ------------------------------------------------------------------
    reg [6:0] cnt;   // horizontal window position, 1..46 while running (0 only after reset)
    reg [6:0] flag;  // row-group index

    always @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            cnt  <= 0;
            flag <= 0;
        end else if (cnt == 46) begin
            // End of a row sweep: restart at column position 1, next row group.
            cnt  <= 1;
            flag <= flag + 1;
        end else begin
            cnt <= cnt + 1;
        end
    end

    // ------------------------------------------------------------------
    // ifmap window fetch
    // ------------------------------------------------------------------
    reg [23:0] row_0_data, row_1_data, row_2_data, row_3_data, row_4_data;
    reg [23:0] row_3_data_d1, row_4_data_d1, row_4_data_d2;

    always @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            row_0_data <= 24'd0;
            row_1_data <= 24'd0;
            row_2_data <= 24'd0;
            row_3_data <= 24'd0;
            row_4_data <= 24'd0;
        end else if (cnt == 0) begin
            // Synchronous clear, kept separate from the asynchronous reset so
            // the block follows the standard async-reset register template.
            row_0_data <= 24'd0;
            row_1_data <= 24'd0;
            row_2_data <= 24'd0;
            row_3_data <= 24'd0;
            row_4_data <= 24'd0;
        end else if (cnt <= 46) begin
            // cnt = 1 selects bits [383:360] (first three pixels of the row),
            // cnt = 46 selects bits [23:0] (last three pixels).
            // For flag > LAST_GROUP nothing is assigned and the registers hold.
            if (flag <= LAST_GROUP) begin
                row_0_data <= image_row[ROW_STEP*flag    ][(49-cnt)*8-1 -: 24];
                row_1_data <= image_row[ROW_STEP*flag + 1][(49-cnt)*8-1 -: 24];
                row_2_data <= image_row[ROW_STEP*flag + 2][(49-cnt)*8-1 -: 24];
                row_3_data <= image_row[ROW_STEP*flag + 3][(49-cnt)*8-1 -: 24];
                if (flag == LAST_GROUP)
                    row_4_data <= 24'd0;  // row 64 does not exist: pad with zero
                else
                    row_4_data <= image_row[ROW_STEP*flag + 4][(49-cnt)*8-1 -: 24];
            end
        end
    end

    // Skew rows 3 and 4 by one and two clocks respectively.
    always @(posedge clk) begin
        row_3_data_d1 <= row_3_data;
        row_4_data_d1 <= row_4_data;
        row_4_data_d2 <= row_4_data_d1;
    end

    // ------------------------------------------------------------------
    // PE array interconnect
    // ------------------------------------------------------------------
    wire [17:0] OUT_1_1, OUT_1_2, OUT_1_3, OUT_2_1, OUT_2_2, OUT_2_3, OUT_3_1, OUT_3_2, OUT_3_3;
    wire [23:0] IN1_1_1, IN1_1_2, IN1_1_3, IN1_2_1, IN1_2_2, IN1_2_3, IN1_3_1, IN1_3_2, IN1_3_3;
    wire [23:0] IN2_1_1, IN2_1_2, IN2_1_3, IN2_2_1, IN2_2_2, IN2_2_3, IN2_3_1, IN2_3_2, IN2_3_3;
    wire        WEIGHT_EN_1_2, WEIGHT_EN_1_3, WEIGHT_EN_2_2, WEIGHT_EN_2_3, WEIGHT_EN_3_2, WEIGHT_EN_3_3;
    reg         WEIGHT_IN_EN;

    // ifmap words injected at the array boundary (gated by CE).
    assign IN1_1_1 = CE ? row_0_data    : 0;
    assign IN1_2_1 = CE ? row_1_data    : 0;
    assign IN1_3_1 = CE ? row_2_data    : 0;
    assign IN1_3_2 = CE ? row_3_data_d1 : 0;
    assign IN1_3_3 = CE ? row_4_data_d2 : 0;

    // Filter rows injected into the first array column (gated by CE).
    assign IN2_1_1 = CE ? filter[3*3*8-1 : 2*3*8] : 0;
    assign IN2_2_1 = CE ? filter[2*3*8-1 : 1*3*8] : 0;
    assign IN2_3_1 = CE ? filter[1*3*8-1 : 0*3*8] : 0;

    // Sticky, one-clock-delayed copy of CE. Reset to 0 so it is never unknown.
    always @(posedge clk or negedge rst_n) begin
        if (!rst_n)
            WEIGHT_IN_EN <= 1'b0;
        else if (CE)
            WEIGHT_IN_EN <= 1'b1;
    end

    // NOTE(review): PE 1_3 and PE 2_2 use WEIGHT_IN_EN as their CE while all
    // other PEs use CE; PE 1_1 uses WEIGHT_IN_EN as its weight-load enable
    // while PE 2_1 and PE 3_1 use CE. The wiring is kept exactly as found;
    // confirm whether this asymmetry is intentional.

    // ---- array row 1 -------------------------------------------------
    ConvPE3x3 ConvPE3x3_1_1 (
        .clk          (clk          ),
        .rst_n        (rst_n        ),
        .CE           (CE           ),
        .IN1          (IN1_1_1      ),
        .IN2          (IN2_1_1      ),
        .WEIGHT_IN_EN (WEIGHT_IN_EN ),
        .WEIGHT_OUT_EN(WEIGHT_EN_1_2),
        .NEXT_PE_IN1  (             ),
        .NEXT_PE_IN2  (IN2_1_2      ),
        .OUT          (OUT_1_1      )
    );

    ConvPE3x3 ConvPE3x3_1_2 (
        .clk          (clk          ),
        .rst_n        (rst_n        ),
        .CE           (CE           ),
        .IN1          (IN1_1_2      ),
        .IN2          (IN2_1_2      ),
        .WEIGHT_IN_EN (WEIGHT_EN_1_2),
        .WEIGHT_OUT_EN(WEIGHT_EN_1_3),
        .NEXT_PE_IN1  (             ),
        .NEXT_PE_IN2  (IN2_1_3      ),
        .OUT          (OUT_1_2      )
    );

    ConvPE3x3 ConvPE3x3_1_3 (
        .clk          (clk          ),
        .rst_n        (rst_n        ),
        .CE           (WEIGHT_IN_EN ),
        .IN1          (IN1_1_3      ),
        .IN2          (IN2_1_3      ),
        .WEIGHT_IN_EN (WEIGHT_EN_1_3),
        .WEIGHT_OUT_EN(             ),
        .NEXT_PE_IN1  (             ),
        .NEXT_PE_IN2  (             ),
        .OUT          (OUT_1_3      )
    );

    // ---- array row 2 -------------------------------------------------
    ConvPE3x3 ConvPE3x3_2_1 (
        .clk          (clk          ),
        .rst_n        (rst_n        ),
        .CE           (CE           ),
        .IN1          (IN1_2_1      ),
        .IN2          (IN2_2_1      ),
        .WEIGHT_IN_EN (CE           ),
        .WEIGHT_OUT_EN(WEIGHT_EN_2_2),
        .NEXT_PE_IN1  (IN1_1_2      ),  // ifmap forwarded to PE 1_2
        .NEXT_PE_IN2  (IN2_2_2      ),
        .OUT          (OUT_2_1      )
    );

    ConvPE3x3 ConvPE3x3_2_2 (
        .clk          (clk          ),
        .rst_n        (rst_n        ),
        .CE           (WEIGHT_IN_EN ),
        .IN1          (IN1_2_2      ),
        .IN2          (IN2_2_2      ),
        .WEIGHT_IN_EN (WEIGHT_EN_2_2),
        .WEIGHT_OUT_EN(WEIGHT_EN_2_3),
        .NEXT_PE_IN1  (IN1_1_3      ),  // ifmap forwarded to PE 1_3
        .NEXT_PE_IN2  (IN2_2_3      ),
        .OUT          (OUT_2_2      )
    );

    ConvPE3x3 ConvPE3x3_2_3 (
        .clk          (clk          ),
        .rst_n        (rst_n        ),
        .CE           (CE           ),
        .IN1          (IN1_2_3      ),
        .IN2          (IN2_2_3      ),
        .WEIGHT_IN_EN (WEIGHT_EN_2_3),
        .WEIGHT_OUT_EN(             ),
        .NEXT_PE_IN1  (             ),
        .NEXT_PE_IN2  (             ),
        .OUT          (OUT_2_3      )
    );

    // ---- array row 3 -------------------------------------------------
    ConvPE3x3 ConvPE3x3_3_1 (
        .clk          (clk          ),
        .rst_n        (rst_n        ),
        .CE           (CE           ),
        .IN1          (IN1_3_1      ),
        .IN2          (IN2_3_1      ),
        .WEIGHT_IN_EN (CE           ),
        .WEIGHT_OUT_EN(WEIGHT_EN_3_2),
        .NEXT_PE_IN1  (IN1_2_2      ),  // ifmap forwarded to PE 2_2
        .NEXT_PE_IN2  (IN2_3_2      ),
        .OUT          (OUT_3_1      )
    );

    ConvPE3x3 ConvPE3x3_3_2 (
        .clk          (clk          ),
        .rst_n        (rst_n        ),
        .CE           (CE           ),
        .IN1          (IN1_3_2      ),
        .IN2          (IN2_3_2      ),
        .WEIGHT_IN_EN (WEIGHT_EN_3_2),
        .WEIGHT_OUT_EN(WEIGHT_EN_3_3),
        .NEXT_PE_IN1  (IN1_2_3      ),  // ifmap forwarded to PE 2_3
        .NEXT_PE_IN2  (IN2_3_3      ),
        .OUT          (OUT_3_2      )
    );

    ConvPE3x3 ConvPE3x3_3_3 (
        .clk          (clk          ),
        .rst_n        (rst_n        ),
        .CE           (CE           ),
        .IN1          (IN1_3_3      ),
        .IN2          (IN2_3_3      ),
        .WEIGHT_IN_EN (WEIGHT_EN_3_3),
        .WEIGHT_OUT_EN(             ),
        .NEXT_PE_IN1  (             ),
        .NEXT_PE_IN2  (             ),
        .OUT          (OUT_3_3      )
    );

    // ------------------------------------------------------------------
    // Column-wise partial-sum reduction: one output per array column.
    // ------------------------------------------------------------------
    always @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            o_1 <= 0;
            o_2 <= 0;
            o_3 <= 0;
        end else if (CE) begin
            o_1 <= OUT_1_1 + OUT_2_1 + OUT_3_1;
            o_2 <= OUT_1_2 + OUT_2_2 + OUT_3_2;
            o_3 <= OUT_1_3 + OUT_2_3 + OUT_3_3;
        end else begin
            o_1 <= 0;
            o_2 <= 0;
            o_3 <= 0;
        end
    end

endmodule
Testbench文件
`timescale 1ns / 1ps
// eyeriss_tb: simulation harness for the `eyeriss` top level.
//
// Loads one filter word and one image word (binary text) from files in the
// working directory, applies reset, raises CE and dumps all signals to a VCD
// file. The run ends after 10000 time units.
module eyeriss_tb ();

    reg clk;
    reg rst_n;
    reg CE;

    // Free-running clock: toggles every 5 time units.
    initial begin
        clk = 0;
        forever #5 clk = ~clk;
    end

    // Reset / enable sequence.
    // CE is driven low from time 0 so the DUT never sees an unknown enable.
    initial begin
        rst_n = 0;
        CE    = 0;
        #16 rst_n = 1;
        #5  CE    = 1;
    end

    // Single-entry memories so that $readmemb can fill the wide vectors.
    reg  [   3*3*8-1:0] filter[0:0];
    reg  [64*48*8-1:0]  image [0:0];
    wire [        17:0] o_1, o_2, o_3;

    initial begin
        $readmemb("./filter.txt", filter);
        $readmemb("./image.txt", image);
    end

    // Waveform dump and simulation time limit.
    initial begin
        $dumpfile("eyeriss.vcd");
        $dumpvars(0, eyeriss_tb);
        #10000 $finish;
    end

    eyeriss u_eyeriss (
        .clk   (clk      ),
        .rst_n (rst_n    ),
        .CE    (CE       ),
        .filter(filter[0]),
        .image (image[0] ),
        .o_1   (o_1      ),
        .o_2   (o_2      ),
        .o_3   (o_3      )
    );

endmodule
仿真结果
仿真波形图
python结果
二者结果一致,且时序符合构想,仿真结果正确,整体工程已上传个人github仓库https://github.com/Qglddd111/eyeriss/tree/main/eyeriss_qgl。