应用于FPGA和ASIC的单精度浮点乘法器
本篇文章主要基于开源的pulp-platform平台中的FPU单元进行修改,并完成仿真设计。pulp-platform中FPU的开源网址为:https://github.com/pulp-platform/fpu
使用的文件为fpu_v0.1文件夹中的fpu_mult.sv、fpu_norm.sv,以及fpu_utils文件夹中的fpu_ff.sv。修改后的fpu_mult.sv文件如下,主要修改了其参数定义:
// fpu_mult: combinational pre-normalisation stage of a floating-point multiplier.
//
// Produces the unrounded product of two operands that have already been split
// into sign / biased exponent / mantissa (with the hidden bit prepended):
//   * sign     = Sign_a XOR Sign_b
//   * exponent = Exp_a + Exp_b - C_BIAS, as a signed C_EXP_PRENORM-bit value
//   * mantissa = full-width product of the two (C_MANT+1)-bit mantissas
//
// Parameters
//   C_MANT_PRENORM : width of the mantissa product
//   C_EXP_PRENORM  : width of the signed pre-normalisation exponent
//   C_EXP          : width of the biased input exponents
//   C_MANT         : number of stored mantissa bits (inputs are C_MANT+1 wide)
//   C_BIAS         : exponent bias subtracted once from the exponent sum
module fpu_mult
#(
    parameter C_MANT_PRENORM = 48,
    parameter C_EXP_PRENORM  = 10,
    parameter C_EXP          = 8,
    parameter C_MANT         = 23,
    parameter C_BIAS         = 127
)
(
    //Input
    input  logic                            Sign_a_DI,
    input  logic                            Sign_b_DI,
    input  logic [C_EXP-1:0]                Exp_a_DI,
    input  logic [C_EXP-1:0]                Exp_b_DI,
    input  logic [C_MANT:0]                 Mant_a_DI,
    input  logic [C_MANT:0]                 Mant_b_DI,
    //Output
    output logic                            Sign_prenorm_DO,
    output logic signed [C_EXP_PRENORM-1:0] Exp_prenorm_DO,
    output logic [C_MANT_PRENORM-1:0]       Mant_prenorm_DO
);

    //Operand components
    logic                            Sign_a_D;
    logic                            Sign_b_D;
    logic                            Sign_prenorm_D;
    logic [C_EXP-1:0]                Exp_a_D;
    logic [C_EXP-1:0]                Exp_b_D;
    logic [C_MANT:0]                 Mant_a_D;
    logic [C_MANT:0]                 Mant_b_D;

    //Exponent calculations
    logic signed [C_EXP_PRENORM-1:0] Exp_prenorm_D; //signed exponent for normalizer

    //Multiplication
    // The synthesis attribute is attached to the product signal so that it
    // applies to the multiplier (a Xilinx/Vivado attribute; other tools are
    // expected to ignore it).
    (* use_dsp = "yes" *)
    logic [C_MANT_PRENORM-1:0]       Mant_prenorm_D;

    /////////////////////////////////////////////////////////////////////////
    // Assign Inputs                                                       //
    /////////////////////////////////////////////////////////////////////////
    assign Sign_a_D = Sign_a_DI;
    assign Sign_b_D = Sign_b_DI;
    assign Exp_a_D  = Exp_a_DI;
    assign Exp_b_D  = Exp_b_DI;
    assign Mant_a_D = Mant_a_DI;
    assign Mant_b_D = Mant_b_DI;

    /////////////////////////////////////////////////////////////////////////
    // Output calculations                                                 //
    /////////////////////////////////////////////////////////////////////////
    assign Sign_prenorm_D = Sign_a_D ^ Sign_b_D;

    // Both biased exponents are zero-extended by two bits before being summed,
    // then the bias is removed once; the result may be negative.
    assign Exp_prenorm_D  = signed'({2'b0,Exp_a_D}) + signed'({2'b0,Exp_b_D}) - signed'(C_BIAS);

    // The operands are widened to the width of the assignment target, so the
    // full C_MANT_PRENORM-bit product is kept.
    assign Mant_prenorm_D = Mant_a_D * Mant_b_D;

    /////////////////////////////////////////////////////////////////////////
    // Output assignments                                                  //
    /////////////////////////////////////////////////////////////////////////
    assign Sign_prenorm_DO = Sign_prenorm_D;
    assign Exp_prenorm_DO  = Exp_prenorm_D;
    assign Mant_prenorm_DO = Mant_prenorm_D;

endmodule //fpu_mult
单精度浮点数的符号位位宽为1,指数位位宽为8,尾数位位宽为23;C_BIAS为指数计算时需要减去的偏置值,默认为127。fpu_mult.sv文件输出的符号位位宽为1,指数位位宽为10,尾数位位宽为48,因此需要使用fpu_norm.sv文件对结果进行规格化和舍入,将其转换为标准的单精度浮点数表示形式。
fpu_norm.sv文件代码如下:
// fpu_norm: normalisation and rounding stage.
//
// Takes a pre-normalisation mantissa / signed exponent / sign, shifts the
// mantissa so that its leading one is aligned, adjusts the exponent, rounds
// according to RM_SI and reports exponent overflow / underflow.
//
// Inputs
//   Mant_in_DI  : unnormalised mantissa (C_MANT_PRENORM bits)
//   Exp_in_DI   : signed pre-normalisation exponent (C_EXP_PRENORM bits)
//   Sign_in_DI  : sign of the result, only used by the directed rounding modes
//   RM_SI       : rounding mode, compared against the C_RM_* parameters
//   OP_SI       : operation code, compared against the C_FPU_*_CMD parameters
// Outputs
//   Mant_res_DO : rounded mantissa, C_MANT+1 bits (bit C_MANT is the top bit)
//   Exp_res_DO  : low C_EXP bits of the rounded exponent
//   Rounded_SO  : high when any bit below the kept mantissa was non-zero
//   Exp_OF_SO   : rounded exponent >= C_EXP_INF
//   Exp_UF_SO   : rounded exponent <= C_EXP_ZERO
module fpu_norm
#(
    parameter C_MANT_PRENORM     = 48,
    parameter C_EXP_PRENORM      = 25,
    parameter C_MANT_PRENORM_IND = 6,
    parameter C_EXP_ZERO         = 8'h00,
    parameter C_EXP_INF          = 8'hff,
    parameter C_RM               = 3,
    parameter C_CMD              = 4,
    parameter C_MANT             = 23,
    parameter C_EXP              = 8,
    parameter C_FPU_ADD_CMD      = 0,
    parameter C_FPU_SUB_CMD      = 1,
    parameter C_FPU_MUL_CMD      = 2,
    parameter C_RM_NEAREST       = 0,
    parameter C_RM_TRUNC         = 1,
    parameter C_RM_PLUSINF       = 3,
    parameter C_RM_MINUSINF      = 2
)
(
    //Input Operands
    input  logic [C_MANT_PRENORM-1:0]       Mant_in_DI,
    input  logic signed [C_EXP_PRENORM-1:0] Exp_in_DI,
    input  logic                            Sign_in_DI,
    //Rounding Mode
    input  logic [C_RM-1:0]                 RM_SI,
    input  logic [C_CMD-1:0]                OP_SI,
    output logic [C_MANT:0]                 Mant_res_DO,
    output logic [C_EXP-1:0]                Exp_res_DO,
    output logic                            Rounded_SO,
    output logic                            Exp_OF_SO,
    output logic                            Exp_UF_SO
);

    /////////////////////////////////////////////////////////////////////////
    // Normalization                                                       //
    /////////////////////////////////////////////////////////////////////////
    logic [C_MANT_PRENORM_IND-1:0]   Mant_leadingOne_D;
    logic                            Mant_zero_S;
    logic [C_MANT+4:0]               Mant_norm_D;
    logic signed [C_EXP_PRENORM-1:0] Exp_norm_D;

    //trying out stuff for denormals
    logic signed [C_EXP_PRENORM-1:0] Mant_shAmt_D;
    logic signed [C_EXP_PRENORM:0]   Mant_shAmt2_D;
    logic [C_EXP-1:0]                Exp_final_D;
    logic signed [C_EXP_PRENORM-1:0] Exp_rounded_D;

    //sticky bit
    logic                            Mant_sticky_D;
    logic                            Denormal_S;
    logic                            Mant_renorm_S;

    //Detect leading one
    // NOTE(review): fpu_ff is defined in another file. Its first_one_o output
    // is used below as a left-shift amount, so it presumably reports the
    // number of zeros above the leading one - confirm against fpu_ff.sv.
    fpu_ff
    #(
        .LEN(C_MANT_PRENORM))
    LOD
    (
        .in_i        ( Mant_in_DI        ),
        .first_one_o ( Mant_leadingOne_D ),
        .no_ones_o   ( Mant_zero_S       )
    );

    logic Denormals_shift_add_D;
    logic Denormals_exp_add_D;

    // Extra shift of one position when the incoming exponent equals
    // C_EXP_ZERO, the mantissa is non-zero, and either the operation is not a
    // multiplication or the two top mantissa bits are both clear.
    assign Denormals_shift_add_D = ~Mant_zero_S & (Exp_in_DI == C_EXP_ZERO) & ((OP_SI != C_FPU_MUL_CMD) | (~Mant_in_DI[C_MANT_PRENORM-1] & ~Mant_in_DI[C_MANT_PRENORM-2]));

    // Exponent correction that only applies to add / sub operations.
    assign Denormals_exp_add_D   = Mant_in_DI[C_MANT_PRENORM-2] & (Exp_in_DI == C_EXP_ZERO) & ((OP_SI == C_FPU_ADD_CMD) | (OP_SI == C_FPU_SUB_CMD ));

    // The result is treated as denormal when the mantissa is zero or when the
    // leading-one index is at least as large as the incoming exponent.
    assign Denormal_S            = ((C_EXP_PRENORM)'(signed'(Mant_leadingOne_D)) >= Exp_in_DI) || Mant_zero_S;

    // Denormal results are shifted by the exponent (plus the correction above),
    // all others by the leading-one index.
    assign Mant_shAmt_D          = Denormal_S ? Exp_in_DI + Denormals_shift_add_D : Mant_leadingOne_D;

    // Sign-extend by one bit and add the width of Mant_norm_D (C_MANT+5) so the
    // wanted bits land in the upper part of the widened shift result below.
    assign Mant_shAmt2_D         = {Mant_shAmt_D[$high(Mant_shAmt_D)], Mant_shAmt_D} + (C_MANT+4+1);

    //Shift mantissa
    always_comb
    begin
        logic [C_MANT_PRENORM+C_MANT+4:0] temp;
        temp = ((C_MANT_PRENORM+C_MANT+4+1)'(Mant_in_DI) << (Mant_shAmt2_D) );
        // keep the C_MANT+5 bits that sit above the original mantissa width
        Mant_norm_D = temp[C_MANT_PRENORM+C_MANT+4:C_MANT_PRENORM];
    end

    // Sticky bit: OR of the mantissa bits that remain below Mant_norm_D.
    always_comb
    begin
        Mant_sticky_D = 1'b0;
        if (Mant_shAmt2_D <= 0)
            Mant_sticky_D = | Mant_in_DI;
        else if (Mant_shAmt2_D <= C_MANT_PRENORM)
            Mant_sticky_D = | (Mant_in_DI << (Mant_shAmt2_D));
    end

    //adjust exponent
    assign Exp_norm_D = Exp_in_DI - (C_EXP_PRENORM)'(signed'(Mant_leadingOne_D)) + 1 + Denormals_exp_add_D;
    //Explanation of the +1 since I'll probably forget:
    //we get numbers in the format xx.x...
    //but to make things easier we interpret them as
    //x.xx... and adjust the exponent accordingly

    // A carry out of the rounding adder bumps the exponent by one.
    assign Exp_rounded_D = Exp_norm_D + Mant_renorm_S;
    assign Exp_final_D   = Exp_rounded_D[C_EXP-1:0];

    always_comb //detect exponent over/underflow
    begin
        Exp_OF_SO = 1'b0;
        Exp_UF_SO = 1'b0;
        if (Exp_rounded_D >= signed'({2'b0,C_EXP_INF})) //overflow
        begin
            Exp_OF_SO = 1'b1;
        end
        else if (Exp_rounded_D <= signed'({2'b0,C_EXP_ZERO})) //underflow
        begin
            Exp_UF_SO = 1'b1;
        end
    end

    /////////////////////////////////////////////////////////////////////////
    // Rounding                                                            //
    /////////////////////////////////////////////////////////////////////////
    logic [C_MANT:0]   Mant_upper_D;
    logic [3:0]        Mant_lower_D;
    logic [C_MANT+1:0] Mant_upperRounded_D;
    logic              Mant_roundUp_S;
    logic              Mant_rounded_S;

    // Mant_norm_D = { kept mantissa (C_MANT+1 bits), 4 extra low-order bits }
    assign Mant_lower_D   = Mant_norm_D[3:0];
    assign Mant_upper_D   = Mant_norm_D[C_MANT+4:4];

    // Any non-zero bit below the kept mantissa marks the result as rounded.
    assign Mant_rounded_S = (|(Mant_lower_D)) | Mant_sticky_D;

    always_comb //determine whether to round up or not
    begin
        Mant_roundUp_S = 1'b0;
        case (RM_SI)
            C_RM_NEAREST :
                // round up when above the halfway point, or exactly halfway
                // with an odd kept LSB (ties to even)
                Mant_roundUp_S = Mant_lower_D[3] && (((| Mant_lower_D[2:0]) | Mant_sticky_D) || Mant_upper_D[0]);
            C_RM_TRUNC :
                Mant_roundUp_S = 0;
            C_RM_PLUSINF :
                Mant_roundUp_S = Mant_rounded_S & ~Sign_in_DI;
            C_RM_MINUSINF:
                Mant_roundUp_S = Mant_rounded_S & Sign_in_DI;
            default :
                Mant_roundUp_S = 0;
        endcase // case (RM_DI)
    end // always_comb begin

    // One extra bit on top captures a carry out of the rounding increment.
    assign Mant_upperRounded_D = Mant_upper_D + Mant_roundUp_S;
    assign Mant_renorm_S       = Mant_upperRounded_D[C_MANT+1];

    /////////////////////////////////////////////////////////////////////////
    // Output Assignments                                                  //
    /////////////////////////////////////////////////////////////////////////
    // Shift right by one after a rounding carry, except for denormal results.
    assign Mant_res_DO = Mant_upperRounded_D >> (Mant_renorm_S & ~Denormal_S);
    assign Exp_res_DO  = Exp_final_D;
    assign Rounded_SO  = Mant_rounded_S;

endmodule // fpu_norm
为了方便输入测试,编写单精度浮点乘法器顶层文件fpu_mult_inst.sv如下:
// fpu_mult_inst: single-precision multiplier wrapper around fpu_mult + fpu_norm.
//
// Unpacks two 32-bit operands into sign / exponent / mantissa, multiplies them
// with fpu_mult, registers the pre-normalisation result once, and then
// normalises / rounds it with fpu_norm (rounding mode 0, operation code 2,
// matching C_RM_NEAREST and C_FPU_MUL_CMD below).
//
// Ports
//   clk, rst_n   : clock and active-low asynchronous reset of the pipeline register
//   A, B         : 32-bit operands (bit 31 sign, 30:23 exponent, 22:0 fraction)
//   Enable       : when low, all fields fed to the multiplier are forced to zero
//   Result       : {sign, 8-bit exponent, 23-bit fraction} of the product
//   Result_valid : Enable delayed by one clock, aligned with Result
//
// NOTE(review): no special handling of zero / infinity / NaN operands is done
// in this wrapper, and the overflow / underflow / rounded flags of fpu_norm are
// left unconnected - confirm whether such operands need to be supported.
module fpu_mult_inst(
    input  logic        clk,
    input  logic        rst_n,
    input  logic [31:0] A,
    input  logic [31:0] B,
    input  logic        Enable,
    output logic [31:0] Result,
    output logic        Result_valid
);

    // Widths shared by both sub-modules; declaring the interconnect with them
    // keeps every port connection width-exact.
    localparam int C_EXP          = 8;
    localparam int C_MANT         = 23;
    localparam int C_EXP_PRENORM  = 10;
    localparam int C_MANT_PRENORM = 48;

    // Unpacked operands
    logic                            Sign_a_DI;
    logic                            Sign_b_DI;
    logic [C_EXP-1:0]                Exp_a_DI;
    logic [C_EXP-1:0]                Exp_b_DI;
    logic [C_MANT:0]                 Mant_a_DI;
    logic [C_MANT:0]                 Mant_b_DI;
    logic                            HB_a;
    logic                            HB_b;

    // Normalised result (mantissa still carries its top bit at index C_MANT)
    logic [C_MANT:0]                 Mant_Re;
    logic [C_EXP-1:0]                Exp_Re;

    // Pre-normalisation result, before and after the pipeline register
    logic                            PreNorm_Z_sign;
    logic signed [C_EXP_PRENORM-1:0] PreNorm_Z_Exp;
    logic [C_MANT_PRENORM-1:0]       PreNorm_Z_Mant;
    logic                            PreNorm_Z_sign_delay;
    logic signed [C_EXP_PRENORM-1:0] PreNorm_Z_Exp_delay;
    logic [C_MANT_PRENORM-1:0]       PreNorm_Z_Mant_delay;

    // Hidden bit: 1 whenever the (gated) exponent field is non-zero.
    assign HB_a = | Exp_a_DI;
    assign HB_b = | Exp_b_DI;

    assign Sign_a_DI = Enable ? A[31]            : 1'b0;
    assign Sign_b_DI = Enable ? B[31]            : 1'b0;
    assign Exp_a_DI  = Enable ? A[30:23]         : '0;
    assign Exp_b_DI  = Enable ? B[30:23]         : '0;
    assign Mant_a_DI = Enable ? {HB_a, A[22:0]}  : '0;
    assign Mant_b_DI = Enable ? {HB_b, B[22:0]}  : '0;

    fpu_mult #(
        .C_MANT_PRENORM(C_MANT_PRENORM),
        .C_EXP_PRENORM(C_EXP_PRENORM),
        .C_EXP(C_EXP),
        .C_MANT(C_MANT),
        .C_BIAS(127)
    ) fpu_mult_inst0 (
        //Input
        .Sign_a_DI(Sign_a_DI),
        .Sign_b_DI(Sign_b_DI),
        .Exp_a_DI(Exp_a_DI),
        .Exp_b_DI(Exp_b_DI),
        .Mant_a_DI(Mant_a_DI),
        .Mant_b_DI(Mant_b_DI),
        //Output
        .Sign_prenorm_DO(PreNorm_Z_sign),
        .Exp_prenorm_DO(PreNorm_Z_Exp),
        .Mant_prenorm_DO(PreNorm_Z_Mant)
    );

    // Pipeline register between the multiplier and the normaliser.
    always_ff @(posedge clk, negedge rst_n)
    begin
        if (!rst_n) begin
            PreNorm_Z_sign_delay <= 1'b0;
            PreNorm_Z_Exp_delay  <= '0;
            PreNorm_Z_Mant_delay <= '0;
            Result_valid         <= 1'b0;
        end
        else begin
            PreNorm_Z_sign_delay <= PreNorm_Z_sign;
            PreNorm_Z_Exp_delay  <= PreNorm_Z_Exp;
            PreNorm_Z_Mant_delay <= PreNorm_Z_Mant;
            Result_valid         <= Enable;
        end
    end

    fpu_norm #(
        .C_MANT_PRENORM(C_MANT_PRENORM),
        .C_EXP_PRENORM(C_EXP_PRENORM),
        .C_MANT_PRENORM_IND(6),
        .C_EXP_ZERO(8'h00),
        .C_EXP_INF(8'hff),
        .C_RM(3),
        .C_CMD(4),
        .C_MANT(C_MANT),
        .C_EXP(C_EXP),
        .C_FPU_ADD_CMD(0),
        .C_FPU_SUB_CMD(1),
        .C_FPU_MUL_CMD(2),
        .C_RM_NEAREST(0),
        .C_RM_TRUNC(1),
        .C_RM_PLUSINF(3),
        .C_RM_MINUSINF(2)
    ) fn_inst (
        //Input Operands
        .Mant_in_DI(PreNorm_Z_Mant_delay),
        .Exp_in_DI(PreNorm_Z_Exp_delay),
        .Sign_in_DI(PreNorm_Z_sign_delay),
        //Rounding Mode
        .RM_SI(3'd0),   // == C_RM_NEAREST
        .OP_SI(4'd2),   // == C_FPU_MUL_CMD
        .Mant_res_DO(Mant_Re),
        .Exp_res_DO(Exp_Re),
        .Rounded_SO(),
        .Exp_OF_SO(),
        .Exp_UF_SO()
    );

    // The top mantissa bit is not stored in the packed result.
    assign Result = {PreNorm_Z_sign_delay, Exp_Re, Mant_Re[C_MANT-1:0]};

endmodule
仿真测试文件编写如下:
`timescale 1ns/1ns
// tb_fpu_mult: directed-stimulus testbench for fpu_mult_inst.
//
// Releases reset, applies four operand pairs on consecutive clock cycles with
// Enable high, then de-asserts Enable and lets the simulation run for five
// more clocks before finishing. Every result flagged by Result_valid is
// printed for inspection; no automatic pass/fail checking is done.
module tb_fpu_mult;
    reg         clk;
    reg         rst_n;
    reg  [31:0] A;
    reg  [31:0] B;
    reg         Enable;
    wire [31:0] Z;
    wire        Result_valid;

    // 40 ns clock period
    always #20 clk = ~clk;

    // Stimulus. All signals that change in step with the clock are driven
    // with nonblocking assignments so the DUT register always samples the
    // value from before the edge; blocking assignments here would race with
    // the DUT's posedge-triggered process.
    initial begin
        clk    = 0;
        Enable = 0;
        A      = 0;
        B      = 0;
        rst_n  = 0;
        repeat(2) @(posedge clk);
        rst_n <= 1;
        repeat(5) @(posedge clk);
        Enable <= 1;
        A <= 32'h3f9df3b6;
        B <= 32'h3d59e800;
        repeat(1) @(posedge clk);
        A <= 32'h4007df3b;
        B <= 32'h3c6d9100;
        repeat(1) @(posedge clk);
        A <= 32'h43550000;
        B <= 32'h3cbe7a80;
        repeat(1) @(posedge clk);
        A <= 32'h45053000;
        B <= 32'h3caf8380;
        repeat(1) @(posedge clk);
        Enable <= 0;
        repeat(5) @(posedge clk);
        $finish;
    end

    // Print each valid result (sampled just before the clock edge).
    always @(posedge clk) begin
        if (Result_valid)
            $display("%0t ns: Z = 32'h%h", $time, Z);
    end

    fpu_mult_inst fmi(
        .clk(clk),
        .rst_n(rst_n),
        .A(A),
        .B(B),
        .Enable(Enable),
        .Result(Z),
        .Result_valid(Result_valid)
    );
endmodule
波形仿真如下:
利用Vivado2019对该设计进行综合,资源占用情况如下,使用的器件型号为xc7z020clg400-2