层次架构
fast
xFFastCornerDetection
xFfast7x7
ProcessFast(OutputValues)
// 八个xFfastProc并行找角点
xFfastProc(OutputValues,src_buf[WIN_SZ][],win_size,_threshold,pack_corners) //对一个Bresenham圆验证角点
// Main code goes here
// Bresenham's circle score computation
// Bresenham's circle score computation complete
// Decision making for corners
// Corner position computation complete
// NMS Score Computation
xFCoreScore // computes the score of a corner pixel; the scores are later compared with their neighbors in a 3x3 window (xFnmsProc)
xFfastnms
Processfastnms
xfExtractPixels
//xFnmsProc处理3*3
xFnmsProc(OutputValues[8],src_buf[3][8 + 2],3) //src_buf is score
xfPackPixels
_out_mat.write(write_index++, P0);
计算角点
xFfast7x7
计算角点
// Number of samples taken on the Bresenham circle around the candidate pixel
// (xFfastProc fills flag_d[..][0..15] from 16 circle positions).
#define PSize 16
// Length of the per-pixel difference array: the PSize circle samples followed by the first
// PSize / 2 + 1 = 9 samples repeated, so runs that wrap past sample 15 can be scanned linearly.
#define NUM 25
xFfast7x7<SRC_T, ROWS, COLS, DEPTH, NPC, WORDWIDTH_SRC, (COLS >> XF_BITSHIFT(NPC)) + (7 >> 1), 7, 7 * 7>
(
_src_mat, _dst, 7, _image_height, _image_width, _threshold);
// xFfast7x7: row-level driver of the FAST score stage.
// Keeps WIN_SZ image rows in the line buffer `buf`, addressed through the rotating slot table
// `row_ind`, and calls ProcessFast once per row; ProcessFast writes the packed scores to _out_mat.
//
// Template parameters (the example values are the ones used by the 7x7 call):
//   TC        - trip-count hint for the column loops
//   WIN_SZ    - window height/width
//   WIN_SZ_SQ - WIN_SZ * WIN_SZ
// Parameters:
//   _src_mat   - input image, consumed sequentially through read(read_index++)
//   _out_mat   - output image, filled sequentially by ProcessFast
//   win_size   - run-time window size; expected to equal WIN_SZ (row_ind has WIN_SZ entries)
//   img_height - image height in pixels
//   img_width  - image width in pixels
//   _threshold - FAST intensity threshold, forwarded to ProcessFast
template <int SRC_T, int ROWS, int COLS, int DEPTH, int NPC,
int WORDWIDTH,
int TC, // (COLS >> XF_BITSHIFT(NPC)) + (7 >> 1), e.g. 4 + 3 = 7; (7 >> 1) is the circle radius
int WIN_SZ, // 7
int WIN_SZ_SQ> // 7 * 7
void xFfast7x7(xf::cv::Mat<SRC_T, ROWS, COLS, NPC>& _src_mat,
xf::cv::Mat<SRC_T, ROWS, COLS, NPC>& _out_mat,// receives the packed scores
ap_uint<8> win_size, // 7
uint16_t img_height,
uint16_t img_width,
uchar_t _threshold) {
// Rotating table: row_ind[k] is the slot of `buf` that currently holds window row k.
ap_uint<13> row_ind[WIN_SZ];
#pragma HLS ARRAY_PARTITION variable=row_ind complete dim=1
// Corner flag written by xFfastProc (through ProcessFast); not read in this function.
XF_PTNAME(DEPTH) pack_corners;
uint16_t shift_x = 0;
ap_uint<13> row, col;
// Scores of the pixels handled in one column iteration.
XF_PTNAME(DEPTH) OutputValues[XF_NPIXPERCYCLE(NPC)];
#pragma HLS ARRAY_PARTITION variable=OutputValues complete dim=1
// e.g. ap_uint<8> src_buf[7][14];
// Sliding pixel window shared with ProcessFast.
XF_PTNAME(DEPTH) src_buf[WIN_SZ][XF_NPIXPERCYCLE(NPC) + (WIN_SZ - 1)];
#pragma HLS ARRAY_PARTITION variable=src_buf complete dim=1
#pragma HLS ARRAY_PARTITION variable=src_buf complete dim=2
// src_buf1 et al merged
XF_SNAME(WORDWIDTH) P0;
// Line buffer: WIN_SZ rows of packed words.
XF_SNAME(WORDWIDTH) buf[WIN_SZ][(COLS >> XF_BITSHIFT(NPC))]; // [7][COLS >> XF_BITSHIFT(NPC)]
#pragma HLS ARRAY_PARTITION variable=buf complete dim=1
#pragma HLS RESOURCE variable=buf core=RAM_S2P_BRAM
// initializing row index: window row k starts in buffer slot k
for (int init_row_ind = 0; init_row_ind < win_size; init_row_ind++) {
#pragma HLS LOOP_TRIPCOUNT min=WIN_SZ max=WIN_SZ
row_ind[init_row_ind] = init_row_ind;
}
int read_index = 0;
int write_index = 0;
read_lines:
// Pre-load the first image rows into slots row_ind[win_size >> 1] .. row_ind[win_size - 1] - 1
// (slots 3, 4 and 5 for a 7x7 window); the last slot is filled inside ProcessFast.
// e.g. for (int init_buf = 3; init_buf < 6; init_buf++) { // read 3 lines
for (int init_buf = row_ind[win_size >> 1]; init_buf < row_ind[win_size - 1]; init_buf++) {
#pragma HLS LOOP_TRIPCOUNT min=WIN_SZ max=WIN_SZ
for (col = 0; col<img_width>> XF_BITSHIFT(NPC); col++) {// one input word per iteration
#pragma HLS LOOP_TRIPCOUNT min=TC max=TC
#pragma HLS pipeline
#pragma HLS LOOP_FLATTEN OFF
buf[init_buf][col] = _src_mat.read(read_index++);
}
}
// takes care of top borders: slots 0 .. (WIN_SZ >> 1) - 1 (buf[0:2][] for a 7x7 window) are zeroed
for (col = 0; col<img_width>> XF_BITSHIFT(NPC); col++) {
#pragma HLS LOOP_TRIPCOUNT min=TC max=TC
// e.g. for (int init_buf = 0; init_buf < 3; init_buf++) {
for (int init_buf = 0; init_buf<WIN_SZ>> 1; init_buf++) {
#pragma HLS LOOP_TRIPCOUNT min=WIN_SZ max=WIN_SZ
#pragma HLS UNROLL
buf[init_buf][col] = 0; // alternative (disabled): replicate buf[row_ind[win_size>>1]][col]
}
}
Row_Loop:
// `row` is the index of the row being read in this pass; it runs (win_size >> 1) rows past the
// image height so the rows still held in the buffer are flushed.
// e.g. for (row = 3; row < 32 + 3; row++) {
// e.g. for (row = (7 >> 1); row < img_height + (7 >> 1); row++) {
for (row = (win_size >> 1); row < img_height + (win_size >> 1); row++) {
#pragma HLS LOOP_TRIPCOUNT min=ROWS max=ROWS
P0 = 0;
ProcessFast<SRC_T, ROWS, COLS, DEPTH, NPC, WORDWIDTH, TC, WIN_SZ, WIN_SZ_SQ>(
_src_mat, _out_mat, buf, src_buf, OutputValues,
P0,
img_width, img_height, shift_x, row_ind, row, win_size,
_threshold, pack_corners, read_index, write_index);
// update indices: rotate the slot table by one so the oldest slot becomes the newest
ap_uint<13> zero_ind = row_ind[0];
for (int init_row_ind = 0; init_row_ind < WIN_SZ - 1; init_row_ind++) {
#pragma HLS LOOP_TRIPCOUNT min=WIN_SZ max=WIN_SZ
#pragma HLS UNROLL
row_ind[init_row_ind] = row_ind[init_row_ind + 1];
}
row_ind[win_size - 1] = zero_ind;
} // Row_Loop
}
// ProcessFast: processes one pass over the columns for the row index `row`.
// Per column iteration it (1) stores a newly read input word into the newest line-buffer slot,
// (2) gathers one word from each of the WIN_SZ buffered rows and unpacks it into pixels,
// (3) maintains the sliding pixel window `src_buf`, (4) runs xFfastProc on one WIN_SZ x WIN_SZ
// window per pixel of the word and (5) packs the scores and writes them to _out_mat.
// Only the NPC == XF_NPPC8 path is present in this listing.
//
// Parameters:
//   _src_mat / _out_mat      - input image / output score image
//   buf                      - line buffer owned by the caller
//   src_buf                  - sliding pixel window (cleared at the start of every call)
//   OutputValues             - per-pixel scores of the current word
//   P0                       - packed output word
//   img_width, img_height    - image size in pixels
//   shift_x                  - pack shift, reset to 0 before each pack
//   row_ind                  - slot table: row_ind[k] is the buffer slot of window row k
//   row                      - index of the row being read in this pass
//   win_size                 - run-time window size
//   _threshold               - FAST intensity threshold
//   pack_corners             - corner flag written by xFfastProc
//   read_index / write_index - running positions in _src_mat / _out_mat
template <int SRC_T, int ROWS, int COLS, int DEPTH, int NPC, int WORDWIDTH, int TC, int WIN_SZ, int WIN_SZ_SQ>
void ProcessFast(xf::cv::Mat<SRC_T, ROWS, COLS, NPC>& _src_mat,
xf::cv::Mat<SRC_T, ROWS, COLS, NPC>& _out_mat,
XF_SNAME(WORDWIDTH) buf[WIN_SZ][(COLS >> XF_BITSHIFT(NPC))],
XF_PTNAME(DEPTH) src_buf[WIN_SZ][XF_NPIXPERCYCLE(NPC) + (WIN_SZ - 1)],
XF_PTNAME(DEPTH) OutputValues[XF_NPIXPERCYCLE(NPC)],// per-pixel scores
XF_SNAME(WORDWIDTH) & P0,// packed scores
uint16_t img_width,
uint16_t img_height,
uint16_t& shift_x,
ap_uint<13> row_ind[WIN_SZ],
ap_uint<13> row,
ap_uint<8> win_size,
uchar_t _threshold,
XF_PTNAME(DEPTH) & pack_corners, // is-corner flag
int& read_index,
int& write_index) {
#pragma HLS INLINE
// e.g. ap_uint<64> buf_cop[7]: one word from each buffered row at the current column
XF_SNAME(WORDWIDTH) buf_cop[WIN_SZ];
#pragma HLS ARRAY_PARTITION variable=buf_cop complete dim=1
uint16_t npc = XF_NPIXPERCYCLE(NPC);// 8 in the NPPC8 configuration
// Number of extra column iterations run after the last input word.
uint16_t col_loop_var = 0;
if (npc == 1) {
col_loop_var = (WIN_SZ >> 1);
} else {
col_loop_var = 1;
}
// init src_buf: clear the whole sliding window
for (int extract_px = 0; extract_px < WIN_SZ; extract_px++) {
#pragma HLS LOOP_TRIPCOUNT min=WIN_SZ max=WIN_SZ
#pragma HLS unroll
for (int ext_copy = 0; ext_copy < npc + WIN_SZ - 1; ext_copy++) {
#pragma HLS unroll
// e.g. src_buf[0:6][0:13] = 0;
src_buf[extract_px][ext_copy] = 0;
}
}
Col_Loop:
// e.g. for (col = 0; col < 4 + 1; col++) {
for (ap_uint<13> col = 0; col < ((img_width) >> XF_BITSHIFT(NPC)) + col_loop_var; col++) {
#pragma HLS LOOP_TRIPCOUNT min=TC max=TC
#pragma HLS pipeline
#pragma HLS LOOP_FLATTEN OFF
// Input is read only while rows and words remain, e.g. row in 3 .. 32 + 3 with
// if (row < 32 && col < 4)
if (row < img_height && col < (img_width >> XF_BITSHIFT(NPC)))
// e.g. buf[6][col] = _src_mat.read(read_index++);
buf[row_ind[win_size - 1]][col] = _src_mat.read(read_index++); // store the new row in the newest slot
if (NPC == XF_NPPC8) {
// buf_cop[0:6] = buf[row_ind[0:6]][col]: copy one column of WIN_SZ words
for (int copy_buf_var = 0; copy_buf_var < WIN_SZ; copy_buf_var++) {
#pragma HLS LOOP_TRIPCOUNT min=WIN_SZ max=WIN_SZ
#pragma HLS UNROLL
// Past the last image row, window rows beyond the last valid one reuse the word of the last
// valid row, e.g. if ((row > 31) && (copy_buf_var > (6 - (row - 31)))) {
if ((row > (img_height - 1)) && (copy_buf_var > (win_size - 1 - (row - (img_height - 1))))) {
buf_cop[copy_buf_var] = buf[(row_ind[win_size - 1 - (row - (img_height - 1))])][col];
}
else {
if (col < (img_width >> XF_BITSHIFT(NPC)))
// buf_cop[0:6] = buf[row_ind[0:6]][col];
buf_cop[copy_buf_var] = buf[(row_ind[copy_buf_var])][col];
}
}
// Unpack every gathered word into individual pixels.
XF_PTNAME(DEPTH) src_buf_temp_copy[WIN_SZ][XF_NPIXPERCYCLE(NPC)];// [7][8]
XF_PTNAME(DEPTH) src_buf_temp_copy_extract[XF_NPIXPERCYCLE(NPC)];
for (int extract_px = 0; extract_px < WIN_SZ; extract_px++) {
#pragma HLS LOOP_TRIPCOUNT min=WIN_SZ max=WIN_SZ
#pragma HLS unroll
XF_SNAME(WORDWIDTH) toextract = buf_cop[extract_px];
xfExtractPixels<NPC, WORDWIDTH, DEPTH>(src_buf_temp_copy_extract, toextract, 0);
for (int ext_copy = 0; ext_copy < npc; ext_copy++) {
#pragma HLS unroll
src_buf_temp_copy[extract_px][ext_copy] = src_buf_temp_copy_extract[ext_copy];
}
}
// Fill the right-most (WIN_SZ >> 1) window columns.
// e.g. for (int extract_px = 0; extract_px < 7; extract_px++) {
for (int extract_px = 0; extract_px < WIN_SZ; extract_px++) {
#pragma HLS LOOP_TRIPCOUNT min=WIN_SZ max=WIN_SZ
// e.g. for (int col_warp = 0; col_warp < 3; col_warp++) {
for (int col_warp = 0; col_warp < (WIN_SZ >> 1); col_warp++) {
#pragma HLS UNROLL
#pragma HLS LOOP_TRIPCOUNT min=WIN_SZ max=WIN_SZ
if (col == img_width >> XF_BITSHIFT(NPC)) {
// Flush iteration after the last word: replicate the last pixel held in the window.
src_buf[extract_px][col_warp + npc + (WIN_SZ >> 1)] =
src_buf[extract_px][npc + (WIN_SZ >> 1) - 1];
} else {
// e.g. ap_uint<8> src_buf[7][8 + 6];
// src_buf[0:6][11:13] = src_buf_temp_copy[0:6][0:2];
src_buf[extract_px][col_warp + npc + (WIN_SZ >> 1)] = src_buf_temp_copy[extract_px][col_warp];
}
}
}
// First word of the row: fill the left part of the window with the first pixel of the word.
if (col == 0) {
for (int extract_px = 0; extract_px < WIN_SZ; extract_px++) {
#pragma HLS LOOP_TRIPCOUNT min=WIN_SZ max=WIN_SZ
for (int col_warp = 0; col_warp < npc + (WIN_SZ >> 1); col_warp++) {
#pragma HLS UNROLL
#pragma HLS LOOP_TRIPCOUNT min=WIN_SZ max=WIN_SZ
// src_buf[0:6][0:10] = src_buf_temp_copy[0:6][0];
src_buf[extract_px][col_warp] = src_buf_temp_copy[extract_px][0];
}
}
}
// e.g. ap_uint<8> src_buf_temp_med_apply[7][14]; only the first WIN_SZ columns are filled below
XF_PTNAME(DEPTH) src_buf_temp_med_apply[WIN_SZ][XF_NPIXPERCYCLE(NPC) + (WIN_SZ - 1)];
// One corner test per pixel of the word (8 in parallel for NPPC8).
for (int applyfast = 0; applyfast < npc; applyfast++) {
#pragma HLS UNROLL
for (int copyi = 0; copyi < WIN_SZ; copyi++) {
for (int copyj = 0; copyj < WIN_SZ; copyj++) {
// Window number `applyfast` is src_buf[0:6][applyfast : applyfast + 6]:
//   applyfast = 0 -> src_buf[0:6][0:6]
//   applyfast = 1 -> src_buf[0:6][1:7]
//   applyfast = 2 -> src_buf[0:6][2:8]
//   applyfast = 3 -> src_buf[0:6][3:9]
//   applyfast = 4 -> src_buf[0:6][4:10]
//   applyfast = 5 -> src_buf[0:6][5:11]
//   applyfast = 6 -> src_buf[0:6][6:12]
//   applyfast = 7 -> src_buf[0:6][7:13]
src_buf_temp_med_apply[copyi][copyj] = src_buf[copyi][copyj + applyfast];
}
}
XF_PTNAME(DEPTH) OutputValues_percycle[1];
OutputValues_percycle[0] = 0;
// The test is skipped (score stays 0) when row < 6 or row >= img_height, for the first three
// windows while col <= 1 and for the last three windows of the flush iteration.
// NOTE(review): this appears to exclude a 3-pixel image border, where the radius-3 circle
// would not lie fully inside the image -- confirm.
if (row < (img_height) && row >= 6 && (!(col <= 1 && applyfast < 3)) &&
(!(col == (((img_width) >> XF_BITSHIFT(NPC))) && applyfast > 4))) // disabled extra term: && (!(col==1 && applyfast<=6)))
{
// Corner test on the Bresenham circle of this single window.
xFfastProc<NPC, WORDWIDTH, DEPTH, WIN_SZ, WIN_SZ_SQ>
(OutputValues_percycle,// score
src_buf_temp_med_apply,
WIN_SZ, _threshold,
pack_corners); // corner flag: 255 for a corner, otherwise 0
}
if (row >= img_height) {
OutputValues_percycle[0] = 0;
}
OutputValues[applyfast] = OutputValues_percycle[0];
}
// Nothing is written for col == 0: the windows of an iteration are centred on the pixels of the
// previously loaded word (window columns (WIN_SZ >> 1) .. (WIN_SZ >> 1) + npc - 1).
if (col >= 1) {
shift_x = 0;
P0 = 0;
xfPackPixels<NPC, WORDWIDTH, DEPTH>(OutputValues, P0, 0, npc, shift_x);
_out_mat.write(write_index++, P0);
}
// Slide the window: the pixels npc columns further right become the left margin ...
for (int extract_px = 0; extract_px < WIN_SZ; extract_px++) {
#pragma HLS LOOP_TRIPCOUNT min=WIN_SZ max=WIN_SZ
for (int col_warp = 0; col_warp < (WIN_SZ >> 1); col_warp++) {
#pragma HLS UNROLL
#pragma HLS LOOP_TRIPCOUNT min=WIN_SZ max=WIN_SZ
// src_buf[0:6][0:2] = src_buf[0:6][8:10];
src_buf[extract_px][col_warp] = src_buf[extract_px][col_warp + npc];
}
}
// ... and the word unpacked in this iteration becomes the centre part of the window.
for (int extract_px = 0; extract_px < WIN_SZ; extract_px++) {
#pragma HLS LOOP_TRIPCOUNT min=WIN_SZ max=WIN_SZ
for (int col_warp = 0; col_warp < npc; col_warp++) {
#pragma HLS UNROLL
#pragma HLS LOOP_TRIPCOUNT min=WIN_SZ max=WIN_SZ
// src_buf[0:6][3:10] = src_buf_temp_copy[0:6][0:7];
src_buf[extract_px][col_warp + (WIN_SZ >> 1)] = src_buf_temp_copy[extract_px][col_warp];
}
}
}
} // Col_Loop (64-bit words)
}
// xFfastProc: FAST corner test and score for one candidate pixel.
// The candidate is src_buf[3][3]; its 16 neighbours on the circle are compared against it.
// The pixel is a corner when more than PSize / 2 consecutive circle samples (with wrap-around)
// fall into the same non-zero class; for corners the score is computed by xFCoreScore.
//
// Parameters:
//   OutputValues - OutputValues[0] receives the score (0 when the pixel is not a corner)
//   src_buf      - window holding the candidate and its circle; indices 0..6 are read in both dims
//   win_size     - window size (not used in the body)
//   _threshold   - intensity difference threshold
//   pack_corners - bits 7..0 are set to 255 for a corner, 0 otherwise
template <int NPC, int WORDWIDTH, int DEPTH, int WIN_SZ, int WIN_SZ_SQ>
void xFfastProc(XF_PTNAME(DEPTH) OutputValues[XF_NPIXPERCYCLE(NPC)], // caller passes OutputValues_percycle
XF_PTNAME(DEPTH) src_buf[WIN_SZ][XF_NPIXPERCYCLE(NPC) + (WIN_SZ - 1)],// caller passes src_buf_temp_med_apply
ap_uint<8> win_size, // 7
uchar_t _threshold,
// e.g. ap_uint<8>& pack_corners
XF_PTNAME(DEPTH) & pack_corners) // corner flag: 255 for a corner, otherwise 0
{
#pragma HLS INLINE
// Bit offsets into tbuf_temp (kx) and pack_corners (ix); both advance by 8 per processed pixel.
uchar_t kx = 0, ix = 0;
// e.g. ap_uint<8> tbuf_temp;
XF_PTNAME(DEPTH) tbuf_temp = 0;
// Main code goes here
// Bresenham's circle score computation
// flag_d: candidate minus circle sample; flag_val: class of each difference (0, 1 or 2).
short int flag_d[(1 << XF_BITSHIFT(NPC))][NUM] = {0}, flag_val[(1 << XF_BITSHIFT(NPC))][NUM] = {0};
#pragma HLS ARRAY_PARTITION variable=flag_val dim=1
#pragma HLS ARRAY_PARTITION variable=flag_d dim=1
// The loop runs exactly once (i == 0): one pixel per call.
for (ap_uint<4> i = 0; i < 1; i++) {
#pragma HLS LOOP_TRIPCOUNT MAX=1
#pragma HLS LOOP_FLATTEN off
#pragma HLS PIPELINE II=1
// Compute the intensity difference between the candidate pixel and pixels on the Bresenham's circle
flag_d[i][0] = src_buf[3][3 + i] - src_buf[0][3 + i]; // tbuf4[3+i] - tbuf1[3+i];
flag_d[i][1] = src_buf[3][3 + i] - src_buf[0][4 + i]; // tbuf4[3+i] - tbuf1[4+i];
flag_d[i][2] = src_buf[3][3 + i] - src_buf[1][5 + i]; // tbuf4[3+i] - tbuf2[5+i];
flag_d[i][3] = src_buf[3][3 + i] - src_buf[2][6 + i]; // tbuf4[3+i] - tbuf3[6+i];
flag_d[i][4] = src_buf[3][3 + i] - src_buf[3][6 + i]; // tbuf4[3+i] - tbuf4[6+i];
flag_d[i][5] = src_buf[3][3 + i] - src_buf[4][6 + i]; // tbuf4[3+i] - tbuf5[6+i];
flag_d[i][6] = src_buf[3][3 + i] - src_buf[5][5 + i]; // tbuf4[3+i] - tbuf6[5+i];
flag_d[i][7] = src_buf[3][3 + i] - src_buf[6][4 + i]; // tbuf4[3+i] - tbuf7[4+i];
flag_d[i][8] = src_buf[3][3 + i] - src_buf[6][3 + i]; // tbuf4[3+i] - tbuf7[3+i];
flag_d[i][9] = src_buf[3][3 + i] - src_buf[6][2 + i]; // tbuf4[3+i] - tbuf7[2+i];
flag_d[i][10] = src_buf[3][3 + i] - src_buf[5][1 + i]; // tbuf4[3+i] - tbuf6[1+i];
flag_d[i][11] = src_buf[3][3 + i] - src_buf[4][0 + i]; // tbuf4[3+i] - tbuf5[0+i];
flag_d[i][12] = src_buf[3][3 + i] - src_buf[3][0 + i]; // tbuf4[3+i] - tbuf4[0+i];
flag_d[i][13] = src_buf[3][3 + i] - src_buf[2][0 + i]; // tbuf4[3+i] - tbuf3[0+i];
flag_d[i][14] = src_buf[3][3 + i] - src_buf[1][1 + i]; // tbuf4[3+i] - tbuf2[1+i];
flag_d[i][15] = src_buf[3][3 + i] - src_buf[0][2 + i]; // tbuf4[3+i] - tbuf1[2+i];
// Repeating the first 9 values so that runs wrapping past sample 15 stay contiguous
flag_d[i][16] = flag_d[i][0];
flag_d[i][17] = flag_d[i][1];
flag_d[i][18] = flag_d[i][2];
flag_d[i][19] = flag_d[i][3];
flag_d[i][20] = flag_d[i][4];
flag_d[i][21] = flag_d[i][5];
flag_d[i][22] = flag_d[i][6];
flag_d[i][23] = flag_d[i][7];
flag_d[i][24] = flag_d[i][8];
// Classification of pixels on the Bresenham's circle into brighter, darker or similar w.r.t.
// the candidate pixel: 1 when the difference exceeds _threshold, 2 when it is below
// -_threshold, 0 otherwise. Each iteration classifies samples j and j + 8.
for (ap_uint<4> j = 0; j < 8; j++) {
#pragma HLS unroll
if (flag_d[i][j] > _threshold)
flag_val[i][j] = 1;
else if (flag_d[i][j] < -_threshold)
flag_val[i][j] = 2;
else
flag_val[i][j] = 0;
if (flag_d[i][j + 8] > _threshold)
flag_val[i][j + 8] = 1;
else if (flag_d[i][j + 8] < -_threshold)
flag_val[i][j + 8] = 2;
else
flag_val[i][j + 8] = 0;
// Mirror classes 0..7 into entries 16..23; the ninth repeated entry is set after the loop
flag_val[i][j + PSize] = flag_val[i][j];
}
// Ninth repeated entry: index 24 mirrors index 8.
flag_val[i][PSize / 2 + PSize] = flag_val[i][PSize / 2];
flag_d[i][PSize / 2 + PSize] = flag_d[i][PSize / 2];
// Bresenham's circle score computation complete
// Decision making for corners
uchar_t core = 0;
uchar_t iscorner = 0;
// Length of the current run of equal, non-zero classes.
uchar_t count = 1;
for (ap_uint<5> c = 1; c < PSize + PSize / 2 + 1; c++) {
#pragma HLS LOOP_TRIPCOUNT MAX=25
#pragma HLS UNROLL
if ((flag_val[i][c - 1] == flag_val[i][c]) && flag_val[i][c] > 0) {
count++;
if (count > PSize / 2) {
iscorner = 1; // Candidate pixel is a corner
}
}
else {
count = 1;
}
} // Corner position computation complete
// NMS Score Computation: only corners get a score, every other pixel keeps core == 0
if (iscorner) {
xFCoreScore(flag_d[i], _threshold, &core);
pack_corners.range(ix + 7, ix) = 255;
} else
pack_corners.range(ix + 7, ix) = 0;
ix += 8;
// Pack the 8-bit score values into 64-bit words
tbuf_temp.range(kx + 7, kx) = core; // write the score into bits kx+7 .. kx
kx += 8;
}
// Hand the score back through the output array instead of returning tbuf_temp.
OutputValues[0] = tbuf_temp; // array[(WIN_SZ_SQ)>>1];
return;
}
// coreScore computes the score for corner pixels
// For a given pixel identified as corner in process_function, the theshold is
// increaded by a small value in each iteration till the pixel becomes
// a non-corner. That value of threshold becomes the score for that corner pixel.
static void xFCoreScore(short int* flag_d, int _threshold, uchar_t* core) {
#pragma HLS INLINE
short int flag_d_min2[NUM - 1];
short int flag_d_max2[NUM - 1];
short int flag_d_min4[NUM - 3];
short int flag_d_max4[NUM - 3];
short int flag_d_min8[NUM - 7];
short int flag_d_max8[NUM - 7];
for (ap_uint<5> i = 0; i < NUM - 1; i++) {
flag_d_min2[i] = __MIN(flag_d[i], flag_d[i + 1]);
flag_d_max2[i] = __MAX(flag_d[i], flag_d[i + 1]);
}
for (ap_uint<5> i = 0; i < NUM - 3; i++) {
flag_d_min4[i] = __MIN(flag_d_min2[i], flag_d_min2[i + 2]);
flag_d_max4[i] = __MAX(flag_d_max2[i], flag_d_max2[i + 2]);
}
for (ap_uint<5> i = 0; i < NUM - 7; i++) {
flag_d_min8[i] = __MIN(flag_d_min4[i], flag_d_min4[i + 4]);
flag_d_max8[i] = __MAX(flag_d_max4[i], flag_d_max4[i + 4]);
}
uchar_t a0 = _threshold;
for (ap_uint<5> i = 0; i < PSize; i += 2) {
short int a = 255;
if (PSize == 16) {
a = flag_d_min8[i + 1];
}
// else {
// for(ap_uint<5> j=1;j<PSize/2+1;j++)
// {
// a=__MIN(a,flag_d[i+j]);
// }
// }
a0 = __MAX(a0, __MIN(a, flag_d[i])); // a0 >= _threshold
a0 = __MAX(a0, __MIN(a, flag_d[i + PSize / 2 + 1]));
}
short int b0 = -_threshold;
for (ap_uint<5> i = 0; i < PSize; i += 2) {
short int b = -255;
if (PSize == 16) {
b = flag_d_max8[i + 1];
}
// } else {
// for(ap_uint<5> j=1;j<PSize/2+1;j++)
// {
// b=__MAX(b,flag_d[i+j]);
// }
// }
b0 = __MIN(b0, __MAX(b, flag_d[i])); // b0 <= -_threshold
b0 = __MIN(b0, __MAX(b, flag_d[i + PSize / 2 + 1]));
}
*core = __MAX(a0, -b0) - 1;
} // Core window score computation complete
fast
xFFastCornerDetection
xFfast7x7 //输出score
ProcessFast(OutputValues)
// 八个xFfastProc并行找角点
xFfastProc(OutputValues,src_buf[WIN_SZ][],win_size,_threshold,pack_corners) //对一个Bresenham圆验证角点
// Main code goes here
// Bresenham's circle score computation
// Bresenham's circle score computation complete
// Decision making for corners
// Corner position computation complete
// NMS Score Computation
xFCoreScore
xFfastnms //output kp的点图 值为255是kp
{//for (row = 1; row < 32 + 1; row++)//整幅图32*32处理
Processfastnms
{// for (ap_uint<13> col = 0; col < 5; col++)
xfExtractPixels
//xFnmsProc处理3*3
xFnmsProc(OutputValues[8],src_buf[3][8 + 2],3) //src_buf is score; compares the score of the candidate pixel with its neighbors in a 3x3 window
xfPackPixels
_out_mat.write(write_index++, P0);
NMS
xFFastCornerDetection<SRC_T, ROWS, COLS, XF_DEPTH(SRC_T, NPC),//XF_8UP
NPC, XF_WORDWIDTH(SRC_T, NPC),//XF_WORDWIDTH(XF_8UC1, XF_NPPC8)=XF_64UW
XF_32UW(没有使用), NMS>(
_src_mat , _dst_mat, _src_mat.rows, _src_mat.cols, _threshold);
xFfastnms<SRC_T, ROWS, COLS, DEPTH, NPC, WORDWIDTH_SRC,
(COLS >> XF_BITSHIFT(NPC)) + (3 >> 1),
3,
3 * 3>
(_dst,
_dst_mat,
3,
_image_height,
_image_width
);
//SRC_T: XF_8UC1
//ROWS input_img_height 32 64 128..
//COLS
//DEPTH :XF_8UP
//NPC
//WORDWIDTH :NPC8:wordwidth = XF_64UW= 20 NPC1:wordwidth = XF_8UW = 1,
//TC :(COLS >> XF_BITSHIFT(NPC))+ (3 >> 1)=4 + 1=5
//WIN_SZ:3
//WIN_SZ_SQ: 3*3
// xFfastnms: row-level driver of the non-maximum-suppression stage.
// Keeps WIN_SZ rows of the score image in the line buffer `buf`, addressed through the rotating
// slot table `row_ind`, and calls Processfastnms once per row; Processfastnms writes the packed
// result words to _out_mat.
//
// Template parameters:
//   TC        - trip-count hint for the column loops
//   WIN_SZ    - window height/width (3 in the call shown in these notes)
//   WIN_SZ_SQ - WIN_SZ * WIN_SZ
// Parameters:
//   _src_mat   - score image, consumed sequentially through read(readind_val++)
//   _out_mat   - output image, filled sequentially by Processfastnms
//   win_size   - run-time window size; expected to equal WIN_SZ (row_ind has WIN_SZ entries)
//   img_height - image height in pixels
//   img_width  - image width in pixels
template <int SRC_T, int ROWS, int COLS, int DEPTH, int NPC, int WORDWIDTH, int TC, int WIN_SZ, int WIN_SZ_SQ>
void xFfastnms(xf::cv::Mat<SRC_T, ROWS, COLS, NPC>& _src_mat,
               xf::cv::Mat<SRC_T, ROWS, COLS, NPC>& _out_mat,
               ap_uint<8> win_size, // 3
               uint16_t img_height,
               uint16_t img_width) {
    // e.g. ap_uint<13> row_ind[3]; row_ind[k] is the slot of `buf` holding window row k
    ap_uint<13> row_ind[WIN_SZ];
#pragma HLS ARRAY_PARTITION variable=row_ind complete dim=1
    uint16_t shift_x = 0;
    ap_uint<13> row, col;
    // e.g. ap_uint<8> OutputValues[8]
    XF_PTNAME(DEPTH) OutputValues[XF_NPIXPERCYCLE(NPC)]; // XF_NPIXPERCYCLE(NPC) = 1 or 8
#pragma HLS ARRAY_PARTITION variable=OutputValues complete dim=1
    // e.g. ap_uint<8> src_buf[3][8 + 2]
    XF_PTNAME(DEPTH) src_buf[WIN_SZ][XF_NPIXPERCYCLE(NPC) + (WIN_SZ - 1)];
#pragma HLS ARRAY_PARTITION variable=src_buf complete dim=1
#pragma HLS ARRAY_PARTITION variable=src_buf complete dim=2
    // src_buf1 et al merged
    // e.g. ap_uint<64> P0
    XF_SNAME(WORDWIDTH) P0;
    // e.g. ap_uint<64> buf[3][COLS >> 3]
    XF_SNAME(WORDWIDTH) buf[WIN_SZ][(COLS >> XF_BITSHIFT(NPC))];
#pragma HLS ARRAY_PARTITION variable=buf complete dim=1
#pragma HLS RESOURCE variable=buf core=RAM_S2P_BRAM

    // initializing row index: window row k starts in buffer slot k
    for (int init_row_ind = 0; init_row_ind < win_size; init_row_ind++) {
#pragma HLS LOOP_TRIPCOUNT min=WIN_SZ max=WIN_SZ
        row_ind[init_row_ind] = init_row_ind;
    }

    int readind_val = 0, writeind_val = 0;

    // Pre-load the first rows into slots row_ind[win_size >> 1] .. row_ind[win_size - 1] - 1
    // (only slot 1 for a 3x3 window); the last slot is filled inside Processfastnms.
    // e.g. for (int init_buf = 1; init_buf < 2; init_buf++)
read_lines:
    for (int init_buf = row_ind[win_size >> 1]; init_buf < row_ind[win_size - 1]; init_buf++) { // rows
#pragma HLS LOOP_TRIPCOUNT min=WIN_SZ max=WIN_SZ
        // e.g. for (col = 0; col < (img_width >> 3); col++)
        for (col = 0; col<img_width>> XF_BITSHIFT(NPC); col++) { // words of the row
#pragma HLS LOOP_TRIPCOUNT min=TC max=TC
#pragma HLS pipeline
#pragma HLS LOOP_FLATTEN OFF
            // e.g. buf[1][col] = _src_mat.read(readind_val++);
            buf[init_buf][col] = _src_mat.read(readind_val++);
        }
    }

    // takes care of top borders: slots 0 .. (WIN_SZ >> 1) - 1 receive a copy of the first
    // loaded row, e.g. buf[0][col] = buf[row_ind[1]][col]
    for (col = 0; col<img_width>> XF_BITSHIFT(NPC); col++) {
#pragma HLS LOOP_TRIPCOUNT min=TC max=TC
        // e.g. for (int init_buf = 0; init_buf < 1; init_buf++)
        for (int init_buf = 0; init_buf<WIN_SZ>> 1; init_buf++) {
#pragma HLS LOOP_TRIPCOUNT min=WIN_SZ max=WIN_SZ
#pragma HLS UNROLL
            buf[init_buf][col] = buf[row_ind[win_size >> 1]][col];
        }
    }

    // `row` is the index of the row being read in this pass; it runs (win_size >> 1) rows past
    // the image height so the rows still held in the buffer are flushed.
    // e.g. for (row = 1; row < 32 + 1; row++)
Row_Loop:
    for (row = (win_size >> 1); row < img_height + (win_size >> 1); row++) {
#pragma HLS LOOP_TRIPCOUNT min=ROWS max=ROWS
        P0 = 0;
        Processfastnms<SRC_T, ROWS, COLS, DEPTH, NPC, WORDWIDTH, TC, WIN_SZ, WIN_SZ_SQ>(
            _src_mat, _out_mat, buf, src_buf, OutputValues, P0, img_width, img_height, shift_x, row_ind, row, win_size,
            readind_val, writeind_val);

        // update indices: rotate the slot table by one so the oldest slot becomes the newest
        ap_uint<13> zero_ind = row_ind[0];
        for (int init_row_ind = 0; init_row_ind < WIN_SZ - 1; init_row_ind++) {
#pragma HLS LOOP_TRIPCOUNT min=WIN_SZ max=WIN_SZ
#pragma HLS UNROLL
            row_ind[init_row_ind] = row_ind[init_row_ind + 1]; // move down one row
        }
        row_ind[win_size - 1] = zero_ind;
    } // Row_Loop
}
// Processfastnms: processes one pass over the columns of the score image for the row index `row`.
// Per column iteration it (1) stores a newly read word into the newest line-buffer slot,
// (2) gathers one word from each of the WIN_SZ buffered rows and unpacks it into scores,
// (3) maintains the sliding window `src_buf`, (4) runs xFnmsProc on one WIN_SZ x WIN_SZ window per
// pixel of the word and (5) packs the results and writes them to _out_mat.
// Only the NPC == XF_NPPC8 path is present in this listing.
//
// Parameters:
//   _src_mat / _out_mat      - score image / output image
//   buf                      - line buffer owned by the caller
//   src_buf                  - sliding score window (cleared at the start of every call)
//   OutputValues             - per-pixel results of the current word
//   P0                       - packed output word
//   img_width, img_height    - image size in pixels
//   shift_x                  - pack shift, reset to 0 before each pack
//   row_ind                  - slot table: row_ind[k] is the buffer slot of window row k
//   row                      - index of the row being read in this pass
//   win_size                 - run-time window size
//   read_index / write_index - running positions in _src_mat / _out_mat
template <int SRC_T, int ROWS, int COLS, int DEPTH, int NPC, int WORDWIDTH, int TC, int WIN_SZ, int WIN_SZ_SQ>
void Processfastnms(xf::cv::Mat<SRC_T, ROWS, COLS, NPC>& _src_mat,
xf::cv::Mat<SRC_T, ROWS, COLS, NPC>& _out_mat,
// e.g. ap_uint<64> buf
XF_SNAME(WORDWIDTH) buf[WIN_SZ][(COLS >> XF_BITSHIFT(NPC))],
// e.g. ap_uint<8> src_buf[3][8+2]
XF_PTNAME(DEPTH) src_buf[WIN_SZ][XF_NPIXPERCYCLE(NPC) + (WIN_SZ - 1)],
// e.g. ap_uint<8> OutputValues[8]
XF_PTNAME(DEPTH) OutputValues[XF_NPIXPERCYCLE(NPC)],
// e.g. ap_uint<64> P0
XF_SNAME(WORDWIDTH) & P0,
uint16_t img_width,
uint16_t img_height,
uint16_t& shift_x, // 0
ap_uint<13> row_ind[WIN_SZ],// WIN_SZ is 3
ap_uint<13> row,// declared by the caller as ap_uint<13> row, col;
ap_uint<8> win_size, // 3
int& read_index, // caller passes readind_val
int& write_index) {
#pragma HLS INLINE
// e.g. ap_uint<64> buf_cop[3]: one word from each buffered row at the current column
XF_SNAME(WORDWIDTH) buf_cop[WIN_SZ];
#pragma HLS ARRAY_PARTITION variable=buf_cop complete dim=1
uint16_t npc = XF_NPIXPERCYCLE(NPC);// XF_NPIXPERCYCLE(NPC) = 1 or 8
// Number of extra column iterations run after the last input word.
uint16_t col_loop_var = 0;
if (npc == 1) {
col_loop_var = (WIN_SZ >> 1);
} else {
col_loop_var = 1;
}
// Clear the whole sliding window.
for (int extract_px = 0; extract_px < WIN_SZ; extract_px++) {// 3
#pragma HLS LOOP_TRIPCOUNT min=WIN_SZ max=WIN_SZ
#pragma HLS unroll
for (int ext_copy = 0; ext_copy < npc + WIN_SZ - 1; ext_copy++) {// 8+3-1
#pragma HLS unroll
src_buf[extract_px][ext_copy] = 0;
}
}
Col_Loop:
// e.g. for (ap_uint<13> col = 0; col < 5; col++) {
// e.g. for (ap_uint<13> col = 0; col < (32 >> 3) + 1; col++) {
for (ap_uint<13> col = 0; col < ((img_width) >> XF_BITSHIFT(NPC)) + col_loop_var; col++) {
#pragma HLS LOOP_TRIPCOUNT min=TC max=TC
#pragma HLS pipeline
#pragma HLS LOOP_FLATTEN OFF
// Input is read only while rows and words remain.
if (row < img_height && col < (img_width >> XF_BITSHIFT(NPC)))
// The newest slot is row_ind[win_size - 1] (slot 2 before the first rotation).
buf[row_ind[win_size - 1]][col] = _src_mat.read(read_index++); // Read data
if (NPC == XF_NPPC8) {
for (int copy_buf_var = 0; copy_buf_var < WIN_SZ; copy_buf_var++) {
#pragma HLS LOOP_TRIPCOUNT min=WIN_SZ max=WIN_SZ
#pragma HLS UNROLL
// Past the last image row, window rows beyond the last valid one reuse the word of the last
// valid row, e.g. if ((row > (img_height - 1)) && (copy_buf_var > (2 - (row - (img_height - 1)))))
if ((row > (img_height - 1)) && (copy_buf_var > (win_size - 1 - (row - (img_height - 1))))) {
buf_cop[copy_buf_var] = buf[(row_ind[win_size - 1 - (row - (img_height - 1))])][col];
} else {
if (col < (img_width >> XF_BITSHIFT(NPC)))
buf_cop[copy_buf_var] = buf[(row_ind[copy_buf_var])][col];
// else
// buf_cop[copy_buf_var] = buf_cop[copy_buf_var];
}
}// copy buf to buf_cop
// e.g. ap_uint<8> src_buf_temp_copy[3][8]
// e.g. ap_uint<8> src_buf_temp_copy_extract[8]
XF_PTNAME(DEPTH) src_buf_temp_copy[WIN_SZ][XF_NPIXPERCYCLE(NPC)];
XF_PTNAME(DEPTH) src_buf_temp_copy_extract[XF_NPIXPERCYCLE(NPC)];
// Unpack every gathered word into individual scores.
for (int extract_px = 0; extract_px < WIN_SZ; extract_px++) {
#pragma HLS LOOP_TRIPCOUNT min=WIN_SZ max=WIN_SZ
#pragma HLS unroll
// e.g. ap_uint<64> toextract
XF_SNAME(WORDWIDTH) toextract = buf_cop[extract_px];
xfExtractPixels<NPC, WORDWIDTH, DEPTH>(src_buf_temp_copy_extract, toextract, 0);
for (int ext_copy = 0; ext_copy < npc; ext_copy++) {
#pragma HLS unroll
// e.g. src_buf_temp_copy[3][8]
src_buf_temp_copy[extract_px][ext_copy] = src_buf_temp_copy_extract[ext_copy];
}
}
// Fill the right-most (WIN_SZ >> 1) window columns.
for (int extract_px = 0; extract_px < WIN_SZ; extract_px++) {
#pragma HLS LOOP_TRIPCOUNT min=WIN_SZ max=WIN_SZ
for (int col_warp = 0; col_warp < (WIN_SZ >> 1); col_warp++) {
#pragma HLS UNROLL
#pragma HLS LOOP_TRIPCOUNT min=WIN_SZ max=WIN_SZ
if (col == img_width >> XF_BITSHIFT(NPC)) {// right border: replicate the last value held in the window
src_buf[extract_px][col_warp + npc + (WIN_SZ >> 1)] =
src_buf[extract_px][npc + (WIN_SZ >> 1) - 1];
} else {
// e.g. ap_uint<8> src_buf[3][8+2]
// src_buf[0:2][0 + 8 + 1] = src_buf_temp_copy[0:2][col_warp];
src_buf[extract_px][col_warp + npc + (WIN_SZ >> 1)] = src_buf_temp_copy[extract_px][col_warp];
}
}
}
if (col == 0) {// left border: fill the left part of the window with the first value of the word
for (int extract_px = 0; extract_px < WIN_SZ; extract_px++) {
#pragma HLS LOOP_TRIPCOUNT min=WIN_SZ max=WIN_SZ
// e.g. for (int col_warp = 0; col_warp < 8 + 1; col_warp++)
for (int col_warp = 0; col_warp < npc + (WIN_SZ >> 1); col_warp++) {
#pragma HLS UNROLL
#pragma HLS LOOP_TRIPCOUNT min=WIN_SZ max=WIN_SZ
// src_buf[0:2][0:8] = src_buf_temp_copy[0:2][0];
src_buf[extract_px][col_warp] = src_buf_temp_copy[extract_px][0];
}
}
}
// e.g. ap_uint<8> src_buf_temp_med_apply[3][8+2]; only the first WIN_SZ columns are filled below
XF_PTNAME(DEPTH) src_buf_temp_med_apply[WIN_SZ][XF_NPIXPERCYCLE(NPC) + (WIN_SZ - 1)];
// One suppression test per pixel of the word (8 in parallel for NPPC8),
// e.g. for (int applyfast = 0; applyfast < 8; applyfast++) {
for (int applyfast = 0; applyfast < npc; applyfast++) {
#pragma HLS UNROLL
for (int copyi = 0; copyi < WIN_SZ; copyi++) {// window `applyfast` is src_buf[0:2][applyfast : applyfast + 2]
for (int copyj = 0; copyj < WIN_SZ; copyj++) {
src_buf_temp_med_apply[copyi][copyj] = src_buf[copyi][copyj + applyfast];
}
}
// e.g. ap_uint<8> OutputValues_percycle[8]; xFnmsProc writes element 0 only
XF_PTNAME(DEPTH) OutputValues_percycle[XF_NPIXPERCYCLE(NPC)];
xFnmsProc<NPC, DEPTH, WIN_SZ, WIN_SZ_SQ>(OutputValues_percycle, src_buf_temp_med_apply, WIN_SZ);
OutputValues[applyfast] = OutputValues_percycle[0];
}
// Nothing is written for col == 0: the windows of an iteration are centred on the values of the
// previously loaded word (window columns (WIN_SZ >> 1) .. (WIN_SZ >> 1) + npc - 1).
if (col >= 1) {// pack and store
shift_x = 0;
P0 = 0;
xfPackPixels<NPC, WORDWIDTH, DEPTH>(OutputValues, P0, 0, npc, shift_x);
_out_mat.write(write_index++, P0);
}
// Slide the window: the values npc columns further right become the left margin ...
for (int extract_px = 0; extract_px < WIN_SZ; extract_px++) {
#pragma HLS LOOP_TRIPCOUNT min=WIN_SZ max=WIN_SZ
for (int col_warp = 0; col_warp < (WIN_SZ >> 1); col_warp++) {
#pragma HLS UNROLL
#pragma HLS LOOP_TRIPCOUNT min=WIN_SZ max=WIN_SZ
src_buf[extract_px][col_warp] = src_buf[extract_px][col_warp + npc];
}
}
// ... and the word unpacked in this iteration becomes the centre part of the window.
for (int extract_px = 0; extract_px < WIN_SZ; extract_px++) {
#pragma HLS LOOP_TRIPCOUNT min=WIN_SZ max=WIN_SZ
for (int col_warp = 0; col_warp < npc; col_warp++) {
#pragma HLS UNROLL
#pragma HLS LOOP_TRIPCOUNT min=WIN_SZ max=WIN_SZ
src_buf[extract_px][col_warp + (WIN_SZ >> 1)] = src_buf_temp_copy[extract_px][col_warp];
}
}
}
} // Col_Loop
}
// xFnmsProc: non-maximum-suppression decision for the centre of one 3x3 score window.
//   OutputValues - OutputValues[0] receives 255 when the centre is kept, 0 otherwise
//   src_buf      - score window; rows and columns 0..2 are read, the candidate is src_buf[1][1]
//   win_size     - window size (not used in the body)
template <int NPC, int DEPTH, int WIN_SZ, int WIN_SZ_SQ>
void xFnmsProc(XF_PTNAME(DEPTH) OutputValues[XF_NPIXPERCYCLE(NPC)],
               XF_PTNAME(DEPTH) src_buf[WIN_SZ][XF_NPIXPERCYCLE(NPC) + (WIN_SZ - 1)],
               ap_uint<8> win_size) {
#pragma HLS INLINE
    // The candidate survives only with a non-zero score that is strictly greater than the
    // scores of all eight neighbours (ties are suppressed).
    bool is_strict_peak = (src_buf[1][1] != 0);

    // Left and right neighbours in the candidate's own row.
    is_strict_peak = is_strict_peak && (src_buf[1][1] > src_buf[1][0]) && (src_buf[1][1] > src_buf[1][2]);
    // Row above.
    is_strict_peak = is_strict_peak && (src_buf[1][1] > src_buf[0][0]) && (src_buf[1][1] > src_buf[0][1]) &&
                     (src_buf[1][1] > src_buf[0][2]);
    // Row below.
    is_strict_peak = is_strict_peak && (src_buf[1][1] > src_buf[2][0]) && (src_buf[1][1] > src_buf[2][1]) &&
                     (src_buf[1][1] > src_buf[2][2]);

    XF_PTNAME(DEPTH) verdict = 0;
    if (is_strict_peak) {
        verdict = 255;
    }

    OutputValues[0] = verdict; // array[(WIN_SZ_SQ)>>1];
    return;
}