1. 带符号数乘法原理
带符号数乘法原理以及详细计算过程可以参考以下文章。
2. 单周期乘法器组合逻辑实现
// Single-cycle signed multiplier implemented as pure combinational logic.
//
// Ports
//   mult_in1 : 6-bit two's-complement multiplicand
//   mult_in2 : 8-bit two's-complement multiplier
//   mult_out : 13-bit two's-complement product
//
// NOTE: a 13-bit signed value covers -4096..4095, so the single operand pair
// (-32, -128), whose true product is +4096, wraps around on this port.
module mult(
    input  signed [5:0]  mult_in1,
    input  signed [7:0]  mult_in2,
    output signed [12:0] mult_out
);

    // Both operands are declared signed, so `*` is evaluated as a signed
    // multiply with the operands sign-extended to the 13-bit result width.
    wire signed [12:0] product;

    assign product  = mult_in1 * mult_in2;
    assign mult_out = product;

endmodule
组合逻辑的实现比较简单,直接调用*就可以实现。然而,在不考虑逻辑优化的情况下,一个6bits乘8bits的组合逻辑,需要7个加法器。当位宽增大时,乘法的资源消耗是巨大的。并且,过长的组合逻辑链的延时也比较大,会限制整个硬件的最高频率。
3. 使用pipeline实现有符号数乘法
如下图,考虑一个6bits无符号数乘8bits无符号数的情况。在使用组合逻辑的情况下,这个运算需要消耗7个加法器。
如果使用pipeline,我们可以给A部分和B部分各分配1个加法器(共2个),花费3个时钟周期分别对两部分求和,最后再用1个加法器将A与B的结果相加。这样虽然增加了运算的latency,但可以节约面积,并缩短组合逻辑路径,从而提高最高时钟频率。
关于pipeline长度的选择:这里把8个加数分成了两组,每组的累加需要3个clock cycle(再加上装载第一个加数的1个cycle,从mult_en到结果有效共4个cycle)。可以根据应用的需求调整每组加数的数量:每组加数越多(分组越少),消耗的加法器越少,但latency也越高。
刚刚介绍的是无符号数的乘法操作。对于有符号数,加数需要进行补符号位(符号扩展)的操作,最后一个加数还需要取补码。为了简化Verilog的编写,下面的例子在进行补码相乘时,先将补码转换为原码(绝对值)相乘,最后再根据输入的符号位把结果转换回补码。
rtl:
// Multi-cycle signed multiplier: 6-bit x 8-bit two's-complement operands.
//
// Operation
//   * Both operands are converted to magnitudes and multiplied as unsigned
//     numbers; the sign is re-applied to the result at the output.
//   * The eight partial products are split into two groups of four
//     (multiplier bits [3:0] and [7:4]). Each group owns one accumulator
//     that adds one shifted partial product per clock.
//   * The clock edge that samples mult_en loads partial product 0 of each
//     group; the following three edges add partial products 1..3. mult_vld
//     is then high for exactly one cycle while mult_out holds the product.
//   * The operand magnitudes and the result sign are captured on the mult_en
//     edge, so mult_in1 / mult_in2 only need to be valid in that cycle.
//   * Asserting mult_en while an operation is running restarts it with the
//     new operands; no mult_vld is produced for the aborted operation.
//
// Ports
//   clk      : clock, all registers update on the rising edge
//   rst_n    : asynchronous reset, active low
//   mult_in1 : 6-bit signed multiplicand, sampled when mult_en is high
//   mult_in2 : 8-bit signed multiplier, sampled when mult_en is high
//   mult_en  : start pulse
//   mult_out : 13-bit signed product, meaningful from the mult_vld cycle
//              until the next mult_en
//   mult_vld : one-cycle pulse marking a finished multiplication
//
// NOTE: 13 signed bits cover -4096..4095, so the single operand pair
// (-32, -128), whose true product is +4096, wraps around on mult_out.
module mult_pipeline(
    input                clk      ,
    input                rst_n    ,
    input  signed [5:0]  mult_in1 ,
    input  signed [7:0]  mult_in2 ,
    input                mult_en  ,
    output signed [12:0] mult_out ,
    output reg           mult_vld
);

    // Magnitudes of the live inputs (two's complement -> absolute value).
    wire [5:0]  mult_in1_src;
    wire [7:0]  mult_in2_src;
    // Unsigned product of the two magnitudes.
    wire [12:0] mult_out_src;

    // Operand magnitudes and result sign captured when mult_en is sampled.
    reg  [5:0]  mult_in1_lat;
    reg  [7:0]  mult_in2_lat;
    reg         sign_lat;

    reg  [8:0]  add_in  [0:1];   // partial product added in the current cycle
    reg  [9:0]  add_out [0:1];   // running sum of each 4-bit group
    reg  [1:0]  cnt;             // 0..2: accumulating, 3: idle / done

    assign mult_in1_src = mult_in1[5] ? (~mult_in1 + 6'd1) : mult_in1;
    assign mult_in2_src = mult_in2[7] ? (~mult_in2 + 8'd1) : mult_in2;

    // Group 1 covers multiplier bits [7:4], hence the weight of 2^4.
    assign mult_out_src = add_out[0] + {add_out[1], 4'd0};

    // Negate the magnitude product when the operand signs differed.
    assign mult_out = sign_lat ? ((~mult_out_src) + 13'd1) : mult_out_src;

    // Capture the operands so later input changes cannot disturb a running
    // multiplication or flip the sign of a finished result.
    always @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            mult_in1_lat <= 6'd0;
            mult_in2_lat <= 8'd0;
            sign_lat     <= 1'b0;
        end
        else if (mult_en) begin
            mult_in1_lat <= mult_in1_src;
            mult_in2_lat <= mult_in2_src;
            sign_lat     <= mult_in1[5] ^ mult_in2[7];
        end
    end

    genvar i;
    generate
        for (i = 0; i < 2; i = i + 1) begin : gen_acc
            // Accumulator of group i.
            always @(posedge clk or negedge rst_n) begin
                if (!rst_n)
                    add_out[i] <= 10'd0;
                else if (mult_en)
                    // The latches are written on this same edge, so partial
                    // product 0 is taken directly from the live inputs.
                    add_out[i] <= mult_in2_src[4*i] ? {3'd0, mult_in1_src} : 9'd0;
                else if (cnt != 2'd3)
                    add_out[i] <= add_out[i] + add_in[i];
            end

            // Select the shifted partial product for the current step.
            always @(*) begin
                case (cnt)
                    2'd0:    add_in[i] = mult_in2_lat[4*i + 1] ? {2'd0, mult_in1_lat, 1'd0} : 9'd0;
                    2'd1:    add_in[i] = mult_in2_lat[4*i + 2] ? {1'd0, mult_in1_lat, 2'd0} : 9'd0;
                    2'd2:    add_in[i] = mult_in2_lat[4*i + 3] ? {mult_in1_lat, 3'd0}       : 9'd0;
                    default: add_in[i] = 9'd0;
                endcase
            end
        end
    endgenerate

    // Step counter: restarted by mult_en, parks at 3 when the sum is complete.
    always @(posedge clk or negedge rst_n) begin
        if (!rst_n)
            cnt <= 2'd3;
        else if (mult_en)
            cnt <= 2'd0;
        else if (cnt != 2'd3)
            cnt <= cnt + 2'd1;
    end

    // The last addition happens on the edge where cnt == 2. If mult_en is
    // high on that edge the accumulators are reloaded instead, so the result
    // must not be flagged as valid.
    always @(posedge clk or negedge rst_n) begin
        if (!rst_n)
            mult_vld <= 1'b0;
        else
            mult_vld <= (cnt == 2'd2) && !mult_en;
    end

endmodule
tb:
// Testbench for the two multipliers.
//
// The same operand pairs are driven into the combinational reference
// (u_mult) and the multi-cycle implementation (u_mult_pipeline). Whenever
// the multi-cycle design raises mult_vld its result is compared against the
// reference output, and the simulation ends after the last stimulus.
module tb_top();

    bit                clk;
    bit                rst_n;
    bit  signed [5:0]  mult_in1;
    bit  signed [7:0]  mult_in2;
    bit                mult_en;
    wire signed [12:0] mult_out_pip;
    wire signed [12:0] mult_out_comb;
    wire               mult_vld;

    int                err_cnt = 0;

    // Drive one operand pair with a single-cycle mult_en pulse, then keep the
    // operands stable for five more clocks so the result can be observed.
    // All changes happen 1 time unit after the clock edge so that they never
    // race with the DUT sampling its inputs on that edge.
    task automatic run_case(input bit signed [5:0] a, input bit signed [7:0] b);
        @(posedge clk);
        #1;
        mult_in1 = a;
        mult_in2 = b;
        mult_en  = 1'b1;
        @(posedge clk);
        #1;
        mult_en  = 1'b0;
        repeat(5) @(posedge clk);
    endtask

    initial begin
        clk   = 1'b0;
        rst_n = 1'b0;
        #32;
        rst_n = 1'b1;
        #10000;

        run_case(-6'sd2,  -8'sd3);
        run_case( 6'sd2,  -8'sd12);
        run_case( 6'sd5,   8'sd122);
        run_case(-6'sd15,  8'sd14);

        if (err_cnt == 0)
            $display("TEST PASSED");
        else
            $display("TEST FAILED: %0d mismatch(es)", err_cnt);
        // Without $finish the free-running clock keeps the simulation alive.
        $finish;
    end

    always #10 clk = ~clk;

    // Self-check: the operands are still held when mult_vld is sampled, so
    // the combinational reference shows the expected product.
    always @(posedge clk) begin
        if (mult_vld) begin
            if (mult_out_pip !== mult_out_comb) begin
                err_cnt = err_cnt + 1;
                $error("mismatch: %0d * %0d -> pipeline %0d, reference %0d",
                       mult_in1, mult_in2, mult_out_pip, mult_out_comb);
            end
            else begin
                $display("ok: %0d * %0d = %0d", mult_in1, mult_in2, mult_out_pip);
            end
        end
    end

    mult u_mult(
        .mult_in1 (mult_in1),
        .mult_in2 (mult_in2),
        .mult_out (mult_out_comb)
    );

    mult_pipeline u_mult_pipeline(
        .clk      (clk         ),
        .rst_n    (rst_n       ),
        .mult_in1 (mult_in1    ),
        .mult_in2 (mult_in2    ),
        .mult_en  (mult_en     ),
        .mult_out (mult_out_pip),
        .mult_vld (mult_vld    )
    );

endmodule
仿真结果: 使用pipeline,结果会有4个cycle的latency