HLS的FFT设计步骤
本文基于HLS设计example,FFT > fft_single,其为1024点pipelined streaming I/O算法。
大体代码流程如下:
1.包含hls_fft.h库
#include "hls_fft.h"
2.设置预先定义的hls::ip_fft::params_t命名空间中的类成员
struct hls::ip_fft::params_t
3.定义运行时设置,这是axis-4接口的一部分,用于动态调节FFT的一些参数
4.调用FFT函数
hls::fft<param1> (xn1, xk1, &fft_status1, &fft_config1);
5.检查运行结果,该过程可选,用于检测是否溢出。
详细过程如下:
首先定义一个头文件fft_top.h,该文件include进FFT头文件。
前几行的定义输入/输出数据是16bit,FFT点数是(1<<10)1024点。
紧接着params_t的ordering_opt和config_width长度进行了重新设置,一个是输出顺序,一个是axis-4接口配置数据位宽。
typedef ap_fixed<FFT_INPUT_WIDTH,1> data_in_t;
将数据定义成了定点数:1位表示整数部分(即符号位),其余15bit表示小数部分。
然后声明了三个函数。第一个函数用于声明axis-4参数设置,第二个函数是fft结果的status判断,第三个函数是vivado中fft ip调用。
#include "ap_fixed.h"
#include "hls_fft.h"
// Configurable parameters of the FFT instance.
// Bit width of each real/imaginary input component (see data_in_t).
const char FFT_INPUT_WIDTH = 16;
// Bit width of each output component; kept equal to the input width.
const char FFT_OUTPUT_WIDTH = FFT_INPUT_WIDTH;
// Value assigned to config1::config_width.
const char FFT_CONFIG_WIDTH = 16;
// log2 of the transform length.
const char FFT_NFFT_MAX = 10;
// Number of points per transform: 1 << 10 = 1024.
const int FFT_LENGTH = 1 << FFT_NFFT_MAX;
#include <complex>
using namespace std;
// FFT core parameter set: inherits every default from hls::ip_fft::params_t
// and overrides only the configuration-word width and the output ordering.
struct config1 : public hls::ip_fft::params_t {
    static const unsigned config_width = FFT_CONFIG_WIDTH;
    static const unsigned ordering_opt = hls::ip_fft::natural_order;
};
// Run-time configuration and status types of the FFT core, both
// parameterised by config1.
typedef hls::ip_fft::config_t<config1> config_t;
typedef hls::ip_fft::status_t<config1> status_t;
// Input component: fixed point, FFT_INPUT_WIDTH bits in total, 1 integer bit.
typedef ap_fixed<FFT_INPUT_WIDTH,1> data_in_t;
// Output component: fixed point; the integer-bit count grows by the difference
// between output and input widths (1 integer bit when the widths are equal).
typedef ap_fixed<FFT_OUTPUT_WIDTH,FFT_OUTPUT_WIDTH-FFT_INPUT_WIDTH+1> data_out_t;
// Complex sample types used on the FFT input and output sides.
typedef std::complex<data_in_t> cmpxDataIn;
typedef std::complex<data_out_t> cmpxDataOut;
// Front-end stage placed before the FFT core.
//   direction : value handed to config->setDir().
//   config    : FFT run-time configuration object that is filled in.
//   in        : FFT_LENGTH input samples.
//   out       : receives an element-by-element copy of `in`.
void dummy_proc_fe(
bool direction,
config_t* config,
cmpxDataIn in[FFT_LENGTH],
cmpxDataIn out[FFT_LENGTH]);
// Back-end stage placed after the FFT core.
//   status_in : FFT status object whose overflow field is read.
//   ovflo     : receives bit 0 of status_in->getOvflo().
//   in        : FFT_LENGTH samples produced by the FFT core.
//   out       : receives an element-by-element copy of `in`.
void dummy_proc_be(
status_t* status_in,
bool* ovflo,
cmpxDataOut in[FFT_LENGTH],
cmpxDataOut out[FFT_LENGTH]);
// Top-level function: front-end stage, FFT core, then back-end stage.
//   direction : transform direction flag forwarded to the FFT configuration.
//   in        : FFT_LENGTH input samples.
//   out       : FFT_LENGTH transformed samples.
//   ovflo     : receives the overflow flag taken from the FFT status.
void fft_top(
bool direction,
cmpxDataIn in[FFT_LENGTH],
cmpxDataOut out[FFT_LENGTH],
bool* ovflo);
有了头文件,接下来就是testbench调用fft方法来完成fft了。首先读入16bit数,然后转换成浮点数
// Testbench entry point. For every frame it reads stimulus_NN.dat, runs
// fft_top on the samples and compares the output with the reference values in
// stimulus_NN.res. Returns 1 when at least one mismatch was counted, else 0.
int main()
{
const int SIM_FRAMES = 1;
const int SAMPLES = (1 << FFT_NFFT_MAX);
// Running count of real/imaginary outputs that differ from the reference.
int error_num = 0;
// NOTE(review): ovflo_all is never written after this initialisation in the
// visible code, so the "(OVERFLOW!!!)" branch at the end cannot be taken.
bool ovflo_all = false;
// NOTE(review): BUF_SIZE is not defined in the visible excerpt - presumably
// defined elsewhere; confirm it is large enough for "stimulus_NN.dat".
char res_filename[BUF_SIZE]={0};
char dat_filename[BUF_SIZE]={0};
static cmpxDataIn xn_input[SAMPLES];
static cmpxDataOut xk_output[SAMPLES];
for (int frame = 0; frame < SIM_FRAMES; ++frame)
{
// Header fields parsed from the first four records of the stimulus file.
int NFFT = 0;
int CP_LEN = 0; // length of the cyclic prefix to be inserted for each frame
int FWD_INV = 0;
int sc_sch = 0;
// 1-based record counter; records 1-4 are header fields, 5 onwards are samples.
int line_no = 1;
FILE *stimfile;
// Open stimulus .dat file for reading
sprintf(dat_filename, "stimulus_%02d", frame);
strcat(dat_filename,".dat");
stimfile = fopen(dat_filename, "r");
int tmp_re, tmp_im;
float dummy_re, dummy_im;
const int max = 1 << FFT_INPUT_WIDTH; // might not work for > 32 bits!
// Largest raw value that is still interpreted as non-negative.
const int max_half_minus_one = (max/2)-1;
// Scaling factor to get integer into -1 <= x < +1 range
const double sc = ldexp(1.0, FFT_INPUT_WIDTH-1); // might not work for > 32 bits!
if (stimfile == NULL)
{
printf("ERROR: Can't open %s\n",dat_filename);
exit(999);
}
else
{
printf("INFO: Reading %s\n",dat_filename);
// The loop condition consumes one character from the stream on every
// iteration before the fscanf in the body runs.
// NOTE(review): this means the very first character of the file is dropped
// before NFFT is parsed - confirm the stimulus format relies on that.
// NOTE(review): none of the fscanf return values below are checked.
while (fgetc(stimfile) != EOF && line_no < SAMPLES+5)
{
switch (line_no)
{
case 1:
// Point size
fscanf(stimfile,"%X",&NFFT);
printf("NFFT %d\n",NFFT);
break;
case 2:
// CP length
fscanf(stimfile,"%X",&CP_LEN);
printf("CP_LEN %d\n",CP_LEN);
break;
case 3:
// fwd-inv
fscanf(stimfile,"%X",&FWD_INV);
printf("FWD_INV %d\n",FWD_INV);
break;
case 4:
// Scaling schedule sc_sch
fscanf(stimfile,"%X",&sc_sch);
printf("sc_sch %X\n",sc_sch);
break;
default:
// hex data (first 2 columns)
fscanf(stimfile,"%x %x %f %f",&tmp_re,&tmp_im,&dummy_re,&dummy_im);
//printf("%x %x\n",tmp_re,tmp_im);
double input_data_re, input_data_im;
// Raw values above max_half_minus_one are treated as negative: 65536 is
// subtracted before dividing by sc.
// NOTE(review): 65536 matches `max` only while FFT_INPUT_WIDTH is 16.
if (tmp_re > max_half_minus_one) {
input_data_re = ((tmp_re-65536)/sc);
} else {
input_data_re = (tmp_re/sc);
}
//xn_input[line_no-5].re = input_data_re;
//xn_re_hw[line_no-5] = dummy_re;
if (tmp_im > max_half_minus_one) {
input_data_im = ((tmp_im-65536)/sc);
} else {
input_data_im = (tmp_im/sc);
}
//xn_input[line_no-5].im = input_data_im;
//xn_im_hw[line_no-5] = dummy_im;
// Sample records start at record 5, hence the line_no-5 index.
xn_input[line_no-5] = cmpxDataIn(input_data_re, input_data_im);
}
line_no++;
}
}
fclose(stimfile);
然后调用fft和完成fft变换
// Run the transform; FWD_INV from the stimulus header is passed as direction.
// NOTE(review): `ovflo` is not declared anywhere in the visible code -
// presumably a local bool declared in a part of the testbench not shown.
fft_top(FWD_INV, xn_input, xk_output, &ovflo);
至此,fft已经算是完成了,接下来就是读入预先评估的result,比对结果了。
// Open the reference-result file that belongs to this frame.
FILE* resfile;
sprintf(res_filename, "stimulus_%02d", frame);
strcat(res_filename,".res");
if ((resfile = fopen(res_filename, "r")) == 0)
{
printf("ERROR: Can't open %s\n", res_filename);
exit(888);
}
// Read and discard the first two hex fields of the result file.
int tmp;
fscanf(resfile, "%X", &tmp);
fscanf(resfile, "%X", &tmp);
// Compare 2^NFFT output samples, with NFFT as parsed from the stimulus header.
for (int i = 0; i < (1<<NFFT); i++)
{
// Only the two floating-point columns are used for the comparison; the hex
// columns are parsed into tmp_re/tmp_im but not used afterwards.
fscanf(resfile,"%x %x %f %f", &tmp_re, &tmp_im, &dummy_re, &dummy_im);
// The reference value is converted to data_out_t before the exact comparison.
data_out_t golden = dummy_re;
//if (golden != xk_output[i].re)
if (golden != xk_output[i].real())
{
error_num++;
cout << "Frame:" << frame << " index: " << i
<< " Golden: " << golden.to_float() << " vs. RE Output: " << setprecision(14) << xk_output[i].real().to_float() << endl;
}
golden = dummy_im;
//if (golden != xk_output[i].im)
if (golden != xk_output[i].imag())
{
error_num++;
cout << "Frame:" << frame << " index: " << i
<< " Golden: " << golden.to_float() << " vs. IM Output: " << setprecision(14) << xk_output[i].imag().to_float() << endl;
}
}
fclose(resfile);
}
// Summary: mismatches take precedence over the overflow flag in the report.
cout << " ERRORS: " << error_num << endl;
if (error_num > 0)
cout << " (FAILED!!!)" << endl;
else if (ovflo_all)
cout << " (OVERFLOW!!!)" << endl;
else
cout << " (PASSED!!!)" << endl;
// Exit status: 1 on any mismatch, 0 otherwise.
if (error_num > 0)
return 1;
else
return 0;
}
testbench调用的fft_top是fft_top.c文件里的函数,该函数将被做成ip:
#include "fft_top.h"
// Front-end stage: writes the transform direction and the scaling schedule
// into the FFT configuration, then forwards the input samples unchanged.
//   direction : value handed to config->setDir().
//   config    : FFT run-time configuration object to fill in.
//   in        : FFT_LENGTH input samples.
//   out       : receives a copy of `in`.
void dummy_proc_fe(
    bool direction,
    config_t* config,
    cmpxDataIn in[FFT_LENGTH],
    cmpxDataIn out[FFT_LENGTH])
{
    config->setDir(direction);
    // 0x2AB = 0b10_1010_1011, i.e. the 2-bit fields 2,2,2,2,3.
    config->setSch(0x2AB);

    for (int sample = 0; sample < FFT_LENGTH; ++sample) {
        out[sample] = in[sample];
    }
}
// Back-end stage: forwards the FFT output samples unchanged and then reports
// bit 0 of the status overflow field.
//   status_in : FFT status object that is queried with getOvflo().
//   ovflo     : receives the overflow flag.
//   in        : FFT_LENGTH samples produced by the FFT core.
//   out       : receives a copy of `in`.
void dummy_proc_be(
    status_t* status_in,
    bool* ovflo,
    cmpxDataOut in[FFT_LENGTH],
    cmpxDataOut out[FFT_LENGTH])
{
    for (int sample = 0; sample < FFT_LENGTH; ++sample) {
        out[sample] = in[sample];
    }

    // The status is read only after every sample has been copied.
    const bool overflowed = status_in->getOvflo() & 0x1;
    *ovflo = overflowed;
}
// Top-level function: passes the input through the front-end stage, the FFT
// core and the back-end stage.
//   direction : transform direction flag written into the FFT configuration.
//   in        : FFT_LENGTH input samples.
//   out       : FFT_LENGTH transformed samples.
//   ovflo     : receives the overflow flag read from the FFT status.
void fft_top(
bool direction,
complex<data_in_t> in[FFT_LENGTH],
complex<data_out_t> out[FFT_LENGTH],
bool* ovflo)
{
// Port interfaces: ap_hs for direction, ap_fifo for ovflo, in and out.
#pragma HLS interface ap_hs port=direction
#pragma HLS interface ap_fifo depth=1 port=ovflo
#pragma HLS interface ap_fifo depth=FFT_LENGTH port=in,out
// data_pack is requested for both complex sample arrays.
#pragma HLS data_pack variable=in
#pragma HLS data_pack variable=out
// dataflow is requested for the three calls below.
#pragma HLS dataflow
// Intermediate buffers between the front end, the FFT core and the back end.
complex<data_in_t> xn[FFT_LENGTH];
complex<data_out_t> xk[FFT_LENGTH];
config_t fft_config;
status_t fft_status;
// Fill in the configuration and copy `in` to xn.
dummy_proc_fe(direction, &fft_config, in, xn);
// FFT IP
hls::fft<config1>(xn, xk, &fft_status, &fft_config);
// Copy xk to `out` and extract the overflow flag from the status.
dummy_proc_be(&fft_status, ovflo, xk, out);
}
dummy_proc_fe函数做了两个工作:一个是设置做fft还是逆fft,另一个是设置scale值0x2AB。0x2AB的二进制是10 1010 1011,每2位一组即[2 2 2 2 3],表示蝶形算法的每一级右移的位数,这样确保最后的结果也是16位的。总共右移2+2+2+2+3=11位,所以scale对应的缩放倍数是2^(2+2+2+2+3)=2^11=2048。
再来看看仿真的数据和结果:
数据见stimulus_00.dat文件,这里截取部分片段:
0A
0
1
2AB
A437 4C07 -0.71707153320312 0.59396362304688
6015 333C 0.75064086914062 0.40026855468750
B251 FFA6 -0.60690307617188 -0.00274658203125
FD76 85F5 -0.01983642578125 -0.95346069335938
stimulus_00.res文件的结果如下(部分):
00
0
0076 FEF9 0.00360107421875 -0.00802612304688
00C5 009C 0.00601196289062 0.00476074218750
00DD FEAF 0.00674438476562 -0.01028442382812
0103 0014 0.00790405273438 0.00061035156250
0052 0048 0.00250244140625 0.00219726562500
011F 00D5 0.00875854492188 0.00650024414062
FFE9 FFFA -0.00070190429688 -0.00018310546875
FF2C 013F -0.00646972656250 0.00973510742188
前面两行并没有什么用,但实际使用时发现如果不在真正的数据前放些数,读入会出错,似乎是vivado_hls的一个bug。
matlab的结果如下(部分):
7.511932373046726 - 16.318267822265767i
12.4071061347653 + 9.84830812777682i
13.9168538790000 - 20.9499609691262i
16.3122678174807 + 1.35817962557640i
5.22772324927469 + 4.60450708754723i
18.0549676502232 + 13.4125569611171i
-1.34364834348118 - 0.280272736963835i
-13.1356602407057 + 20.0124305673796i
0.454103601471324 + 22.6374966808317i
-6.94603384852326 - 1.77684245234773i
-0.526709138800964 - 38.7395841354917i
14.6932655039269 + 6.50847432331556i
11.2043508653130 + 23.3736128226132i
20.8573462890038 - 4.58878154007059i
3.49584233061040 - 7.66236741383999i
-30.2315244786224 - 20.9903229972919i
这里我开始也是困惑了,和matlab结果相差比较大。这是由于scale的原因。
7.51193237304673/2048 = 0.0037
16.318267822265767/2048 = 0.008
其它项依次类推。
值得注意的是,采用scaling策略可能会带来一些问题。比如先做fft,再将两个fft得到的一维数组进行共轭相乘,然后再做ifft,这时scaling就可能出问题,所以这时采用float型可能更合适。修改的方法也很简单,只需要将fft_top.h中的data_in_t和data_out_t重新定义,定义的方法如下:
typedef float data_in_t;
typedef float data_out_t;
这样就会调用浮点数IP核进行运算了。
但是还有一个地方需要更改,是因为浮点数要求phase factor必须是24或者25bit的。
// FFT core parameter set for the floating-point variant: inherits the
// hls::ip_fft::params_t defaults and overrides the configuration-word width,
// the phase-factor width (24) and the output ordering.
struct config1 : public hls::ip_fft::params_t {
    static const unsigned config_width = FFT_CONFIG_WIDTH;
    static const unsigned phase_factor_width = 24;
    static const unsigned ordering_opt = hls::ip_fft::natural_order;
};
IP的参数设置必须满足以下的要求:
// IP parameters legality checking
// Check CONFIG_T::config_width
config_ch->checkBitWidth(FFT_DATA_FORMAT);
// Check CONFIG_T::status_width
status->checkBitWidth();
// Check ip parameters
if (CONFIG_T::channels < 1 || CONFIG_T::channels > 12)
{
std::cerr << ip_fft::fftErrChkHead << "Channels = " << (int)CONFIG_T::channels
<< " is illegal. It should be from 1 to 12."
<< std::endl;
exit(1);
}
if (CONFIG_T::max_nfft < 3 || CONFIG_T::max_nfft > 16)
{
std::cerr << ip_fft::fftErrChkHead << "NFFT_MAX = " << (int)CONFIG_T::max_nfft
<< " is illegal. It should be from 3 to 16."
<< std::endl;
exit(1);
}
unsigned length = FFT_LENGTH;
if (!CONFIG_T::has_nfft)
{
if (FFT_LENGTH != (1 << CONFIG_T::max_nfft))
{
std::cerr << ip_fft::fftErrChkHead << "FFT_LENGTH = " << (int)FFT_LENGTH
<< " is illegal. Log2(FFT_LENGTH) should equal to NFFT_MAX when run-time configurable length is disabled."
<< std::endl;
exit(1);
}
}
else if (length & (length - 1))
{
std::cerr << ip_fft::fftErrChkHead << "FFT_LENGTH = " << (int)FFT_LENGTH
<< " is illegal. It should be the integer power of 2."
<< std::endl;
exit(1);
}
else if (NFFT < 3 || NFFT > 16)
{
std::cerr << ip_fft::fftErrChkHead << "FFT_LENGTH = " << (int)FFT_LENGTH
<< " is illegal. Log2(FFT_LENGTH) should be from 3 to 16."
<< std::endl;
exit(1);
}
else if (NFFT > CONFIG_T::max_nfft)
{
std::cerr << ip_fft::fftErrChkHead << "FFT_LENGTH = " << (int)FFT_LENGTH
<< " is illegal. Log2(FFT_LENGTH) should be less than or equal to NFFT_MAX."
<< std::endl;
exit(1);
}
#if 0
else if (NFFT != config_ch->getNfft())
{
std::cerr << ip_fft::fftErrChkHead << "FFT_LENGTH = " << (int)FFT_LENGTH
<< " is illegal. Log2(FFT_LENGTH) should equal to NFFT field of configure channel."
<< std::endl;
exit(1);
}
#endif
if ((FFT_INPUT_WIDTH < 8) || (FFT_INPUT_WIDTH > 40))
{
std::cerr << ip_fft::fftErrChkHead << "FFT_INPUT_WIDTH = " << (int)FFT_INPUT_WIDTH
<< " is illegal. It should be 8,16,24,32,40."
<< std::endl;
exit(1);
}
if (CONFIG_T::scaling_opt == ip_fft::unscaled && FFT_DATA_FORMAT != ip_fft::floating_point)
{
unsigned golden = FFT_INPUT_WIDTH + CONFIG_T::max_nfft + 1;
golden = ((golden + 7) >> 3) << 3;
if (FFT_OUTPUT_WIDTH != golden)
{
std::cerr << ip_fft::fftErrChkHead << "FFT_OUTPUT_WIDTH = " << (int)FFT_OUTPUT_WIDTH
<< " is illegal with unscaled arithmetic. It should be input_width+nfft_max+1."
<< std::endl;
exit(1);
}
}
else if (FFT_OUTPUT_WIDTH != FFT_INPUT_WIDTH)
{
std::cerr << ip_fft::fftErrChkHead << "FFT_OUTPUT_WIDTH = " << (int)FFT_OUTPUT_WIDTH
<< " is illegal. It should be the same as input_width."
<< std::endl;
exit(1);
}
if (CONFIG_T::channels > 1 && CONFIG_T::arch_opt == ip_fft::pipelined_streaming_io)
{
std::cerr << ip_fft::fftErrChkHead << "FFT_CHANNELS = " << (int)CONFIG_T::channels << " and FFT_ARCH = pipelined_streaming_io"
<< " is illegal. pipelined_streaming_io architecture is not supported when channels is bigger than 1."
<< std::endl;
exit(1);
}
if (CONFIG_T::channels > 1 && FFT_DATA_FORMAT == ip_fft::floating_point)
{
std::cerr << ip_fft::fftErrChkHead << "FFT_CHANNELS = " << (int)CONFIG_T::channels
<< " is illegal with floating point data format. Floating point data format only supports 1 channel."
<< std::endl;
exit(1);
}
if (FFT_DATA_FORMAT == ip_fft::floating_point)
{
if (CONFIG_T::phase_factor_width != 24 && CONFIG_T::phase_factor_width != 25)
{
std::cerr << ip_fft::fftErrChkHead << "FFT_PHASE_FACTOR_WIDTH = " << (int)CONFIG_T::phase_factor_width
<< " is illegal with floating point data format. It should be 24 or 25."
<< std::endl;
exit(1);
}
}
else if (CONFIG_T::phase_factor_width < 8 || CONFIG_T::phase_factor_width > 34)
{
std::cerr << ip_fft::fftErrChkHead << "FFT_PHASE_FACTOR_WIDTH = " << (int)CONFIG_T::phase_factor_width
<< " is illegal. It should be from 8 to 34."
<< std::endl;
exit(1);
}