作者jhshao原来的openofdm实例化了ISE的复数乘法IP。该复数乘法器是流水线设计,有3级延迟;另外输入、输出各加了一级寄存器(可能是当时作者在N210这类老器件上编译时时序紧张才补充的),总共5级延迟,strobe信号也相应地做了5级延迟。代码如下:
// complex_mult: registered wrapper around the ISE `complex_multiplier` IP core.
//
// Computes (a_i + j*a_q) * (b_i + j*b_q) and presents the result on
// p_i (real part) and p_q (imaginary part).
//
// Ports:
//   clock          - clock for all registers, the IP core and the strobe delay
//   enable         - when high, operand and result registers are updated
//   reset          - synchronous, clears the operand and result registers
//   a_i, a_q       - real / imaginary part of the first operand
//   b_i, b_q       - real / imaginary part of the second operand
//   input_strobe   - marks a valid operand pair
//   p_i, p_q       - real / imaginary part of the product
//   output_strobe  - input_strobe delayed by LATENCY clock cycles
//
// Note: the strobe delay line has no enable input, so output_strobe keeps
// advancing even while `enable` is low.
module complex_mult
(
    input clock,
    input enable,
    input reset,

    input [15:0] a_i,
    input [15:0] a_q,
    input [15:0] b_i,
    input [15:0] b_q,
    input input_strobe,

    output reg [31:0] p_i,
    output reg [31:0] p_q,
    output output_strobe
);

// Total strobe delay in clock cycles: one operand register stage, the IP
// pipeline (described as 3 stages in the accompanying write-up -- confirm
// against the generated core) and one result register stage.
localparam LATENCY = 5;

// Registered copies of the operands that feed the IP core.
reg [15:0] ar;
reg [15:0] ai;
reg [15:0] br;
reg [15:0] bi;

// Raw product coming out of the IP core.
wire [31:0] prod_i;
wire [31:0] prod_q;

complex_multiplier mult_inst (
    .clk(clock),
    .ar(ar),
    .ai(ai),
    .br(br),
    .bi(bi),
    .pr(prod_i),
    .pi(prod_q)
);

// Delay the strobe by the same number of cycles as the data path.
delayT #(.DATA_WIDTH(1), .DELAY(LATENCY)) stb_delay_inst (
    .clock(clock),
    .reset(reset),
    .data_in(input_strobe),
    .data_out(output_strobe)
);

always @(posedge clock) begin
    if (reset) begin
        ar <= 0;
        ai <= 0;
        br <= 0;
        bi <= 0;
        p_i <= 0;
        p_q <= 0;
    end else if (enable) begin
        // Input register stage.
        ar <= a_i;
        ai <= a_q;
        br <= b_i;
        bi <= b_q;

        // Output register stage.
        p_i <= prod_i;
        p_q <= prod_q;
    end
end

endmodule
后来焦博士将复数乘法IP替换成了Vivado下的版本(AXI4-Stream接口),实例化代码如下:
// complex_mult: registered wrapper around the Vivado `complex_multiplier`
// IP core (AXI4-Stream interface).
//
// Computes (a_i + j*a_q) * (b_i + j*b_q) and presents the result on
// p_i (real part) and p_q (imaginary part).
//
// Ports:
//   clock          - clock for all registers, the IP core and the strobe delay
//   enable         - when high, operand and result registers are updated
//   reset          - synchronous, clears the operand and result registers
//   a_i, a_q       - real / imaginary part of the first operand
//   b_i, b_q       - real / imaginary part of the second operand
//   input_strobe   - marks a valid operand pair
//   p_i, p_q       - real / imaginary part of the product
//   output_strobe  - input_strobe delayed by LATENCY clock cycles
//
// Note: the strobe delay line has no enable input, so output_strobe keeps
// advancing even while `enable` is low.
module complex_mult
(
    input clock,
    input enable,
    input reset,

    input [15:0] a_i,
    input [15:0] a_q,
    input [15:0] b_i,
    input [15:0] b_q,
    input input_strobe,

    output reg [31:0] p_i,
    output reg [31:0] p_q,
    output output_strobe
);

// Total strobe delay in clock cycles: one operand register stage, the IP
// pipeline (presumably configured for 3 stages -- confirm against the
// generated core) and one result register stage.
localparam LATENCY = 5;

// Registered copies of the operands that feed the IP core.
reg [15:0] ar;
reg [15:0] ai;
reg [15:0] br;
reg [15:0] bi;

// AXI4-Stream operand words: imaginary part in the upper half, real part
// in the lower half.
wire [31:0] s_axis_a_tdata = {ai, ar};
wire [31:0] s_axis_b_tdata = {bi, br};

// AXI4-Stream result word: imaginary part in [63:32], real part in [31:0].
wire [63:0] m_axis_dout_tdata;
wire [31:0] prod_q = m_axis_dout_tdata[63:32];
wire [31:0] prod_i = m_axis_dout_tdata[31:0];

// Output-valid flag of the IP core; connected but deliberately not used,
// output_strobe is produced by the fixed delay line below instead.
wire m_axis_dout_tvalid;

// NOTE(review): tvalid is driven by the unregistered input_strobe while the
// operand data goes through the ar/ai/br/bi registers first, so tvalid leads
// the data by one cycle. This is harmless only if the core processes data
// every cycle regardless of tvalid -- confirm against the IP configuration.
complex_multiplier mult_inst (
    .aclk(clock),
    .s_axis_a_tvalid(input_strobe),
    .s_axis_a_tdata(s_axis_a_tdata),
    .s_axis_b_tvalid(input_strobe),
    .s_axis_b_tdata(s_axis_b_tdata),
    .m_axis_dout_tvalid(m_axis_dout_tvalid),
    .m_axis_dout_tdata(m_axis_dout_tdata)
);

// Delay the strobe by the same number of cycles as the data path.
delayT #(.DATA_WIDTH(1), .DELAY(LATENCY)) stb_delay_inst (
    .clock(clock),
    .reset(reset),
    .data_in(input_strobe),
    .data_out(output_strobe)
);

always @(posedge clock) begin
    if (reset) begin
        ar <= 0;
        ai <= 0;
        br <= 0;
        bi <= 0;
        p_i <= 0;
        p_q <= 0;
    end else if (enable) begin
        // Input register stage.
        ar <= a_i;
        ai <= a_q;
        br <= b_i;
        bi <= b_q;

        // Output register stage.
        p_i <= prod_i;
        p_q <= prod_q;
    end
end

endmodule
这里同样配齐了5级延迟。我认为必要性不大,因此改成了三级延迟:从原理上讲,只要output_strobe与输出数据的时序对齐即可;实践上,这一改动在原有逻辑的前仿真中也已验证通过。
考虑到复数乘法的公式并不复杂(a+bi)(c+di)=(ac-bd)+(bc+ad)i,另外这些乘法运算综合器会自动调用DSP模块来实例化实现,因此我们完全可以尝试写成RTL的形式。下面是我写的代码:
// complex_mult: pure-RTL signed 16x16 complex multiplier, three register
// stages deep, intended as a drop-in for the IP-based wrapper.
//
//   (a + j*b) * (c + j*d) = (a*c - b*d) + j*(b*c + a*d)
//
// Ports:
//   clock          - clock for the pipeline and the strobe delay
//   enable         - kept for interface compatibility; not used by the data path
//   reset          - only resets the strobe delay line; data registers are free-running
//   a_i, a_q       - real / imaginary part of the first operand
//   b_i, b_q       - real / imaginary part of the second operand
//   input_strobe   - marks a valid operand pair
//   p_i, p_q       - real / imaginary part of the product, 3 clocks after the operands
//   output_strobe  - input_strobe delayed by 3 clocks, aligned with p_i / p_q
module complex_mult
(
input clock,
input enable,
input reset,
input [15:0] a_i, a_q, b_i, b_q,
input input_strobe,
output reg [31:0] p_i, p_q,
output output_strobe
);

    // Signed views of the operand ports so the multiplies below are signed.
    wire signed [15:0] re_a = a_i;
    wire signed [15:0] im_a = a_q;
    wire signed [15:0] re_b = b_i;
    wire signed [15:0] im_b = b_q;

    // Stage 0: the four real partial products.
    reg signed [31:0] prod_rr, prod_ii, prod_ir, prod_ri;
    always @(posedge clock) begin
        prod_rr <= re_a * re_b;
        prod_ii <= im_a * im_b;
        prod_ir <= im_a * re_b;
        prod_ri <= re_a * im_b;
    end

    // Stage 1: combine the partial products into real and imaginary sums.
    reg signed [31:0] sum_re, sum_im;
    always @(posedge clock) begin
        sum_re <= prod_rr - prod_ii;
        sum_im <= prod_ir + prod_ri;
    end

    // Stage 2: output registers.
    always @(posedge clock) begin
        p_i <= sum_re;
        p_q <= sum_im;
    end

    // Strobe travels through the same number of stages as the data.
    delayT #(.DATA_WIDTH(1), .DELAY(3)) complex_mult_delay_inst (
        .clock(clock),
        .reset(reset),
        .data_in(input_strobe),
        .data_out(output_strobe)
    );

endmodule
这里顺便给大家看看delayT的实现:
// delayT: parameterised shift-register delay line.
//
// Parameters:
//   DATA_WIDTH - width of the delayed word in bits
//   DELAY      - delay in clock cycles; 0 (or less) degenerates to a
//                combinational pass-through
//
// Ports:
//   clock    - shift clock
//   reset    - synchronous, clears every stage to 0
//   data_in  - word entering the delay line
//   data_out - data_in as it was DELAY clock cycles ago
module delayT
#(
    parameter DATA_WIDTH = 32,
    parameter DELAY = 1
)
(
    input clock,
    input reset,

    input [DATA_WIDTH-1:0] data_in,
    output [DATA_WIDTH-1:0] data_out
);

generate
    if (DELAY <= 0) begin : g_passthrough
        // Without this branch the storage would be declared as ram[-1:0]
        // and data_out would read an element that is never written.
        assign data_out = data_in;
    end else begin : g_delay_line
        reg [DATA_WIDTH-1:0] ram[DELAY-1:0];
        integer i;

        // The last stage drives the output.
        assign data_out = ram[DELAY-1];

        always @(posedge clock) begin
            if (reset) begin
                for (i = 0; i < DELAY; i = i+1) begin
                    ram[i] <= 0;
                end
            end else begin
                // New word enters at stage 0, everything else moves up one.
                ram[0] <= data_in;
                for (i = 1; i < DELAY; i = i+1) begin
                    ram[i] <= ram[i-1];
                end
            end
        end
    end
endgenerate

endmodule
就是位数和周期都可以配置的延迟线。我在BLOG里面还有另外一种实现,五六年前写的,有兴趣大家可以找找看看。
这个延迟线的参数化实现非常经典,值得大家掌握;在此基础上还可以改写成窗口长度为2的整数次幂的可配置滑动平均滤波器。
另外还有一个complex_mult模块我也做了RTL替代,实现代码如下:
// complex_mult_simple: pure-RTL signed 16x16 complex multiplier with three
// register stages and no reset or enable.
//
//   (ar + j*ai) * (br + j*bi) = (ar*br - ai*bi) + j*(ai*br + ar*bi)
//
// Ports:
//   clk     - pipeline clock
//   ar, ai  - real / imaginary part of the first operand
//   br, bi  - real / imaginary part of the second operand
//   pr, pi  - real / imaginary part of the product, 3 clocks after the operands
module complex_mult_simple ( // by liwei
input clk,
input [15:0]ar,ai,br,bi,
output reg [31:0] pr ,pi ) ;

    // Signed views of the operand ports so the multiplies below are signed.
    wire signed [15:0] re_a = ar;
    wire signed [15:0] im_a = ai;
    wire signed [15:0] re_b = br;
    wire signed [15:0] im_b = bi;

    // Stage 0: the four real partial products.
    reg signed [31:0] prod_rr, prod_ii, prod_ir, prod_ri;
    always @(posedge clk) begin
        prod_rr <= re_a * re_b;
        prod_ii <= im_a * im_b;
        prod_ir <= im_a * re_b;
        prod_ri <= re_a * im_b;
    end

    // Stage 1: combine the partial products into real and imaginary sums.
    reg signed [31:0] sum_re, sum_im;
    always @(posedge clk) begin
        sum_re <= prod_rr - prod_ii;
        sum_im <= prod_ir + prod_ri;
    end

    // Stage 2: output registers.
    always @(posedge clk) begin
        pr <= sum_re;
        pi <= sum_im;
    end

endmodule
做这个替换时,公式本身一般不会写错,需要特别注意的是实部(real)和虚部(imaginary)与各端口的对应关系不要接反。