// Pseudocode sketch: for every (fo, fi, kx) and every pair (ky, ky+1), accumulate
// products over an H x W range (16 columns per step) in the X/Y/Z/T macro MACC
// registers, reduce the partial sums into Macro X, and store the two results to
// df(fo,fi,kx,ky) and df(fo,fi,kx,ky+1).
// Presumably this is the weight-gradient pass of a convolution (out_bp times input);
// confirm against the surrounding design notes.
// NOTE(review): `||` appears to denote operations issued in parallel in the same
// cycle, not the C logical-or operator -- confirm against the target ISA notation.
// NOTE(review): the three loops below have no braces, so under C scoping rules only
// clr_XYZTMACC() is their body and the ky loop is not nested inside them. The df()
// indices used later suggest that nesting was intended -- confirm and add braces.
for(int fo = 0; fo < Nfo; ++fo)
for(int fi = 0; fi < Nfi; ++fi)
for(int kx = 0; kx < Kx; ++kx)
// NOTE(review): the clear sits outside the ky loop, so the accumulators are not
// reset between successive (ky, ky+1) pairs unless a later step resets them -- verify.
clr_XYZTMACC(); // clear the multiply-accumulate (MACC) registers
// ky advances by 2 because each pass produces two outputs: ky and ky+1.
for(int ky = 0; ky < Ky; ky += 2) {
for(int x = 0; x < H; ++x)
for(int y = 0; y < W; y += 16) {
// Load 16 out_bp values into registers.
// Load 16*2 = 32 input values into registers (read over two clock cycles),
// because in a double-word read the two words are adjacent, i.e. two adjacent
// input words correspond to one out_bp word.
XYZTfetch_from_16_conv_window_to_register();
// 16 MACC units compute simultaneously.
XYZTMACC0 += R0 * R4 || XYZTMACC1 += R1 * R4
|| XYZTMACC2 += R2 * R5 || XYZTMACC3 += R3 * R5
}
// End of computing one matrix of out_bp.
//---------------------- reduce and sum ----------------------
XYZTCopy_MACC_to_register(); // copy the MACC results into general registers
// Each execution macro first sums its own partial results internally.
XYZTR0 = R0 + R2 || XYZTR1 = R1 + R3
// Transfer the R0 and R1 values held in Macro Y/Z/T over to Macro X.
gatherToMacroX_R2_7();
// Reduce-sum inside Macro X: i takes 2, 4, 6, so R2/R4/R6 are added into XR0
// and R3/R5/R7 into XR1.
for(int i=2; i<7;i+=2) XR0 += Ri || XR1 += Ri+1;
// Write the two output results from registers to memory.
// NOTE(review): the `;` ending the next line precedes the `||` continuation;
// presumably both stores are meant to issue together -- confirm.
df(fo,fi,kx,ky) = XR0;
|| df(fo,fi,kx,ky+1) = XR1;
}