1)crop
先来看mat中的crop。
// Copies a rectangular region of interest (ROI) from `src` into `dst`.
// The ROI starts at (crop_x, crop_y) and spans dst.cols x dst.rows.
// Every word of `src` is read in raster order; only words inside the ROI are
// written, packed consecutively into `dst`.
// NOTE(review): the column bounds are compared against the loop index j, so
// crop_x and dst.cols are presumably in the same unit as src.cols - confirm
// for NPPC > 1.
template<int TYPE, int BPP, int ROWS, int COLS, int NPPC>
void xfMat_crop(xf::Mat<TYPE, ROWS, COLS, NPPC>& src, xf::Mat<TYPE, ROWS, COLS, NPPC>& dst, u16 crop_x, u16 crop_y)
{
    u16 height = src.rows;
    u16 width = src.cols;
    // ROI bounds: columns [left, right), rows [top, bottom).
    u16 left = crop_x;
    u16 right = crop_x + dst.cols;
    u16 top = crop_y;
    u16 bottom = crop_y + dst.rows;
    int rd = 0;
    int wr = 0;
    for (u16 y = 0; y < height; y++)
    {
#pragma HLS loop_tripcount avg = ROWS max = ROWS
        for (u16 x = 0; x < width; x++)
        {
#pragma HLS pipeline II = 1
#pragma HLS loop_tripcount avg = COLS/NPPC max = COLS/NPPC
#pragma HLS loop_flatten off
            // The source is always consumed, even outside the ROI.
            T_UINT(BPP, NPPC) pixel = src.read(rd++);
            bool inside_cols = (x >= left) && (x < right);
            bool inside_rows = (y >= top) && (y < bottom);
            if (inside_cols && inside_rows) {
                dst.write(wr++, pixel);
            }
        }
    }
}
输入源对象是mat,输出目的对象也是mat。
首先从src对象中获取属性,得到rows和cols。
然后定义几个局部变量,作为操作的中间寄存对象。
然后在两层嵌套的for循环体中,进行逐像素处理。
判断是否属于ROI的像素,如果是,则写入目的对象mat,否则忽略。
这里,主要使用了mat对象的read和write操作集。
再来看看stream中的crop。
// Crops a crop_width x crop_height window located at (crop_x, crop_y) out of a
// width x height pixel stream.
// crop_x does not have to be a multiple of NPPC: the two most recent input
// words are kept in a double-width shift register (`pixel_buffer`) and each
// output word is sliced out of it at bit offset `sel`.
// The inner loop runs one extra iteration per row so that the last buffered
// word of the row can still be emitted.
template<int BPP, int ROWS, int COLS, int NPPC>
void stream_crop(hls::stream<T_UINT(BPP, NPPC)>& src, hls::stream<T_UINT(BPP, NPPC)>& dst, int width, int height, int crop_x, int crop_y, int crop_width, int crop_height)
{
    const int rows = height;
    const int cols = width / NPPC;
    const int min_col = crop_x / NPPC;
    const int max_col = (crop_x + crop_width) / NPPC;
    const int min_row = crop_y;
    const int max_row = crop_y + crop_height;
    const unsigned int step = BPP * NPPC;
    // Bit offset of the first wanted pixel inside a word. Kept as unsigned int
    // because it can reach (NPPC - 1) * BPP, which does not fit an
    // unsigned char for wide words.
    const unsigned int sel = (crop_x % NPPC) * BPP;
    // Zero-initialised so the first shift never reads indeterminate bits.
    T_UINT(BPP, NPPC) srcpixel = 0;
    T_UINT(BPP, NPPC * 2) pixel_buffer = 0;
    T_UINT(BPP, NPPC) dstpixel;
    for (int i = 0; i < rows; i++)
    {
#pragma HLS loop_tripcount avg = ROWS max = ROWS
        for (int j = 0; j < cols + 1; j++)
        {
#pragma HLS pipeline II = 1
#pragma HLS loop_tripcount avg = COLS/NPPC max = COLS/NPPC
            // The extra iteration (j == cols) only flushes the buffer.
            if (j < cols)
            {
                srcpixel = src.read();
            }
            // Previous word moves to the low half, new word enters the high half.
            pixel_buffer = pixel_buffer >> step;
            pixel_buffer(2 * step - 1, step) = srcpixel;
            // Output lags the input by one word, hence the +1 on the column bounds.
            if (j >= (min_col + 1) && (j < (max_col + 1)) && (i >= min_row) && (i < max_row))
            {
                dstpixel = pixel_buffer(step + sel - 1, sel);
                dst << dstpixel;
            }
        }
    }
}
输入源对象是stream,输出目的对象也是stream。
由于stream不再像mat一样,具有row和col属性, 所以需要传入参数来指定,即width和height。
在两层嵌套的for循环体中,进行逐像素处理。
判断是否属于ROI的像素,如果是,则写入目的对象stream,否则忽略。
这里,主要使用了stream对象的read和write操作集。
注意:
HLS中,扩展了C++语法,支持类似于VHDL的位向量截取操作。
如上,pixel_buffer可以截取出位向量,作为一个部分操作数。
++++++++++++++++++++++++++++++++++++++++++++++++++
2)skip_pixel
// Horizontal decimation by two: reads every word of `src` in raster order and
// forwards only the words at even column indices to `dst`, packed consecutively.
template<int TYPE, int BPP, int ROWS, int COLS, int NPPC>
void xfMat_skip_pixel(xf::Mat<TYPE, ROWS, COLS, NPPC>& src, xf::Mat<TYPE, ROWS, COLS, NPPC>& dst)
{
    u16 height = src.rows;
    u16 width = src.cols;
    int rd = 0;
    int wr = 0;
    for (u16 y = 0; y < height; y++)
    {
#pragma HLS loop_tripcount avg = ROWS max = ROWS
        for (u16 x = 0; x < width; x++)
        {
#pragma HLS pipeline II = 1
#pragma HLS loop_tripcount avg = COLS/NPPC max = COLS/NPPC
#pragma HLS loop_flatten off
            // The source is consumed on every iteration, kept or not.
            T_UINT(BPP, NPPC) pixel = src.read(rd++);
            bool odd_column = ((x & 0x1) != 0);
            if (!odd_column) {
                dst.write(wr++, pixel);
            }
        }
    }
}
输入源对象是mat,输出目的对象也是mat。
在两层嵌套的for循环体中,进行逐像素处理。
判断列号的奇偶,如果是偶数,则写入目的对象mat,如果是奇数则忽略。
这里,主要使用了mat对象的read和write操作集。
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++
3)demux
// Fans one pixel stream out into two: `dst0` receives the unmodified input
// pixel and `dst1` receives a gray value computed from it.
// The three 8-bit channels are taken from bits [7:0], [15:8] and [23:16] and
// combined with integer weights 306/601/117 (sum 1024, i.e. roughly
// 0.299/0.587/0.114), followed by a division by 1024.
// Only NPPC == 1 is supported (enforced by the assert).
template<int BPP, int ROWS, int COLS, int NPPC, int G_BPP>
void stream_demux_rgb_gray(hls::stream<T_UINT(BPP, NPPC)>& src, hls::stream<T_UINT(BPP, NPPC)>& dst0, hls::stream<T_UINT(G_BPP, NPPC)>& dst1, u16 width, u16 height)
{
    assert(((NPPC==1)) && "Only 1 pixel-parallelism are supported");
    u16 line_count = height;
    u16 line_length = width ;
    ROWS_LOOP: for (u16 y = 0; y < line_count; y++)
    {
#pragma HLS loop_tripcount avg = ROWS max = ROWS
#pragma HLS loop_flatten off
        COLS_LOOP: for (u16 x = 0; x < line_length; x++)
        {
#pragma HLS pipeline II = 1
#pragma HLS loop_tripcount avg = COLS/NPPC max = COLS/NPPC
            T_UINT(BPP, NPPC) rgb;
            src >> rgb;
            u8 r = rgb(7, 0);
            u8 g = rgb(15, 8);
            u8 b = rgb(23, 16);
            // Weighted sum in fixed point, scaled back by 1024.
            int luma = (r * 306 + g * 601 + b * 117) / 1024;
            // Saturate to the 8-bit range.
            if (luma > 255) {
                luma = 255;
            }
            T_UINT(G_BPP, NPPC) gray = luma;
            dst0 << rgb;
            dst1 << gray;
        }
    }
}
输入源对象是stream,输出目的对象也是stream。
在两层嵌套的for循环体中,进行逐像素处理。
逐像素进行灰度变换后,分别将RGB图像和GRAY图像输出到dst0和dst1中去。
++++++++++++++++++++++++++++++++++++++++++++++++++++++
4)duplicate
// Duplicates an image: every word read from `src` is written to both `dst0`
// and `dst1` at the same position.
// The word is held in a variable of the exact type returned by src.read()
// (deduced with `auto`, which needs C++11), so that words wider than 8 bits
// (multi-channel types or NPPC > 1) are not truncated.
template<int TYPE, int ROWS, int COLS, int NPPC>
void xfMat_duplicate(xf::Mat<TYPE, ROWS, COLS, NPPC>& src, xf::Mat<TYPE, ROWS, COLS, NPPC>& dst0, xf::Mat<TYPE, ROWS, COLS, NPPC>& dst1)
{
    u16 rows = src.rows;
    u16 cols = src.cols;
    u32 src_idx = 0, dst0_idx = 0, dst1_idx = 0;
    for (u16 i = 0; i < rows; i++)
    {
#pragma HLS loop_tripcount avg = ROWS max = ROWS
        for (u16 j = 0; j < cols; j++)
        {
#pragma HLS pipeline II = 1
#pragma HLS loop_tripcount avg = COLS/NPPC max = COLS/NPPC
#pragma HLS loop_flatten off
            // Full-width word; a fixed u8 here would drop the upper bits.
            auto srcpixel = src.read(src_idx++);
            dst0.write(dst0_idx++, srcpixel);
            dst1.write(dst1_idx++, srcpixel);
        }
    }
}
输入源对象是mat,输出目的对象也是mat。
在两层嵌套的for循环体中,进行逐像素处理。
将同一个图像,同时输出到dst0和dst1中去,实现复制。
++++++++++++++++++++++++++++++++++++++++++++++++++
5)split
先来看看mat中的split。
// Splits a packed source word into three fields of DST_BPP bits each:
// bits [DST_BPP-1:0] go to `dst0`, the next DST_BPP bits to `dst1`, and the
// following DST_BPP bits to `dst2`. One word is written to each destination
// per source word.
template<int SRC_T, int BPP, int ROWS, int COLS, int NPPC, int DST_T, int DST_BPP>
void xfMat_split(xf::Mat<SRC_T, ROWS, COLS, NPPC>& src, xf::Mat<DST_T, ROWS, COLS, NPPC>& dst0, xf::Mat<DST_T, ROWS, COLS, NPPC>& dst1, xf::Mat<DST_T, ROWS, COLS, NPPC>& dst2)
{
    u16 height = src.rows;
    u16 width = src.cols;
    const u8 field_bits = DST_BPP;
    int rd = 0;
    int wr0 = 0;
    int wr1 = 0;
    int wr2 = 0;
    for (u16 y = 0; y < height; y++)
    {
#pragma HLS loop_tripcount avg = ROWS max = ROWS
        for (u16 x = 0; x < width; x++)
        {
#pragma HLS pipeline II = 1
#pragma HLS loop_tripcount avg = COLS/NPPC max = COLS/NPPC
#pragma HLS loop_flatten off
            T_UINT(BPP, NPPC) packed = src.read(rd++);
            // Slice the three fields out of the packed word, lowest bits first.
            T_UINT(DST_BPP, NPPC) field0 = packed(field_bits - 1, 0);
            T_UINT(DST_BPP, NPPC) field1 = packed(field_bits * 2 - 1, field_bits);
            T_UINT(DST_BPP, NPPC) field2 = packed(field_bits * 3 - 1, field_bits * 2);
            dst0.write(wr0++, field0);
            dst1.write(wr1++, field1);
            dst2.write(wr2++, field2);
        }
    }
}
输入源对象是mat,输出目的对象也是mat。
在两层嵌套的for循环体中,进行逐像素处理。
将输入的像素进行位向量截取后,分别输出到dst0,dst1,dst2中,实现图像通道分离。
再来看看stream中的split。
// Splits one pixel stream into four half-width streams on a 2x2 grid:
//   even rows: even-position pixels -> dst0, odd-position pixels -> dst1
//   odd rows : even-position pixels -> dst2, odd-position pixels -> dst3
// Each input word holds NPPC pixels; each output word holds NPPC/2 pixels.
template<int BPP, int ROWS, int COLS, int NPPC>
void stream_split(hls::stream<T_UINT(BPP, NPPC)>& src, hls::stream<T_UINT(BPP, NPPC/2)>& dst0, hls::stream<T_UINT(BPP, NPPC/2)>& dst1, hls::stream<T_UINT(BPP, NPPC/2)>& dst2, hls::stream<T_UINT(BPP, NPPC/2)>& dst3, int width, int height)
{
    const int line_count = height;
    const int words_per_line = width / NPPC;
    for (int y = 0; y < line_count; y++)
    {
#pragma HLS loop_tripcount avg = ROWS max = ROWS
        for (int x = 0; x < words_per_line; x++)
        {
#pragma HLS pipeline II = 1
#pragma HLS loop_tripcount avg = COLS/NPPC max = COLS/NPPC
            T_UINT(BPP, NPPC) word;
            src >> word;
            // De-interleave the word into its even- and odd-position pixels.
            T_UINT(BPP, NPPC/2) even_px;
            T_UINT(BPP, NPPC/2) odd_px;
            for (int k = 0; k < NPPC/2; k++)
            {
#pragma HLS unroll
                even_px(BPP * (k + 1) - 1, BPP * k) = word(BPP * (k * 2 + 1) - 1, BPP * k * 2);
                odd_px(BPP * (k + 1) - 1, BPP * k) = word(BPP * (k * 2 + 2) - 1, BPP * (k * 2 + 1));
            }
            // Route by row parity.
            if ((y % 2) == 0)
            {
                dst0 << even_px;
                dst1 << odd_px;
            }
            else
            {
                dst2 << even_px;
                dst3 << odd_px;
            }
        }
    }
}
输入源对象是stream,输出目的对象也是stream。
由于stream不再像mat一样,具有row和col属性, 所以需要传入参数来指定,即width和height。
在三层嵌套的for循环体中,进行逐像素处理。
这里,主要使用了stream中重载的操作符"<<"和">>"。
再来看看输入stream,输出mat的split。
// Splits one pixel stream into four half-size Mats on a 2x2 grid:
//   even rows: even-position pixels -> dst0, odd-position pixels -> dst1
//   odd rows : even-position pixels -> dst2, odd-position pixels -> dst3
// The input size is not passed in; it is derived from dst0 (twice its rows
// and twice its cols, the latter divided by NPPC to get words per line).
template<int TYPE, int BPP, int ROWS, int COLS, int NPPC>
void stream_xfMat_split(hls::stream<T_UINT(BPP, NPPC)>& src, xf::Mat<TYPE, ROWS/2, COLS/2, NPPC/2>& dst0, xf::Mat<TYPE, ROWS/2, COLS/2, NPPC/2>& dst1, xf::Mat<TYPE, ROWS/2, COLS/2, NPPC/2>& dst2, xf::Mat<TYPE, ROWS/2, COLS/2, NPPC/2>& dst3)
{
    const int line_count = dst0.rows * 2;
    const int words_per_line = dst0.cols * 2 / NPPC;
    // dst0/dst1 always advance together, as do dst2/dst3, so one write index
    // per row parity is enough.
    int even_row_idx = 0;
    int odd_row_idx = 0;
    for (int y = 0; y < line_count; y++)
    {
#pragma HLS loop_tripcount avg = ROWS max = ROWS
        for (int x = 0; x < words_per_line; x++)
        {
#pragma HLS pipeline II = 1
#pragma HLS loop_tripcount avg = COLS/NPPC max = COLS/NPPC
            T_UINT(BPP, NPPC) word;
            src >> word;
            // De-interleave the word into its even- and odd-position pixels.
            T_UINT(BPP, NPPC/2) even_px;
            T_UINT(BPP, NPPC/2) odd_px;
            for (int k = 0; k < NPPC/2; k++)
            {
#pragma HLS unroll
                even_px(BPP * (k + 1) - 1, BPP * k) = word(BPP * (k * 2 + 1) - 1, BPP * k * 2);
                odd_px(BPP * (k + 1) - 1, BPP * k) = word(BPP * (k * 2 + 2) - 1, BPP * (k * 2 + 1));
            }
            if ((y % 2) == 0)
            {
                dst0.write(even_row_idx, even_px);
                dst1.write(even_row_idx, odd_px);
                even_row_idx++;
            }
            else
            {
                dst2.write(odd_row_idx, even_px);
                dst3.write(odd_row_idx, odd_px);
                odd_row_idx++;
            }
        }
    }
}
输入源对象是stream,输出目的对象是mat。
虽然stream不像mat一样具有rows和cols属性,但这里并没有传入width和height参数,而是由输出mat(dst0)的rows和cols推算出输入图像的尺寸。
在三层嵌套的for循环体中,进行逐像素处理。
这里,主要使用了stream中重载的操作符"<<"和">>",以及mat中的操作集write。
++++++++++++++++++++++++++++++++++++++++
6)merge
先来看看mat中的merge。
// Packs three source Mats into one destination Mat: for every position the
// words of src0, src1 and src2 are placed in bits [BPP-1:0], [2*BPP-1:BPP]
// and [3*BPP-1:2*BPP] of the destination word.
// Iteration bounds are taken from `dst`.
// The destination word is zero-initialised so that any bits above 3*BPP
// (when the destination word is wider than the three fields) are defined.
template<int SRC_T, int BPP, int ROWS, int COLS, int NPPC, int DST_T, int DST_BPP>
void xfMat_merge(xf::Mat<SRC_T, ROWS, COLS, NPPC>& src0, xf::Mat<SRC_T, ROWS, COLS, NPPC>& src1, xf::Mat<SRC_T, ROWS, COLS, NPPC>& src2, xf::Mat<DST_T, ROWS, COLS, NPPC>& dst)
{
    u16 rows = dst.rows;
    u16 cols = dst.cols;
    const u8 step = BPP;
    T_UINT(BPP, NPPC) src0pixel;
    T_UINT(BPP, NPPC) src1pixel;
    T_UINT(BPP, NPPC) src2pixel;
    int src0_idx = 0, src1_idx = 0, src2_idx = 0, dst_idx = 0;
    for (u16 i = 0; i < rows; i++)
    {
#pragma HLS loop_tripcount avg = ROWS max = ROWS
        for (u16 j = 0; j < cols; j++)
        {
#pragma HLS pipeline II = 1
#pragma HLS loop_tripcount avg = COLS/NPPC max = COLS/NPPC
#pragma HLS loop_flatten off
            src0pixel = src0.read(src0_idx++);
            src1pixel = src1.read(src1_idx++);
            src2pixel = src2.read(src2_idx++);
            // Start from zero: only the low 3*step bits are assigned below.
            T_UINT(DST_BPP, NPPC) dstpixel = 0;
            dstpixel(step - 1, 0) = src0pixel;
            dstpixel(step * 2 - 1, step) = src1pixel;
            dstpixel(step* 3 - 1, step * 2) = src2pixel;
            dst.write(dst_idx++, dstpixel);
        }
    }
}
输入源对象是mat,输出目的对象也是mat。
在两层嵌套的for循环体中,进行逐像素处理。
将输入的像素进行位向量拼接后,输出到dst中,实现图像合并。
再来看看stream中的merge。
// Merges four half-width streams into one full-width stream on a 2x2 grid:
//   even rows: src0 supplies even-position pixels, src1 odd-position pixels
//   odd rows : src2 supplies even-position pixels, src3 odd-position pixels
// Each input word holds NPPC/2 pixels; each output word holds NPPC pixels.
template<int BPP, int ROWS, int COLS, int NPPC>
void stream_merge(hls::stream<T_UINT(BPP, NPPC/2)>& src0, hls::stream<T_UINT(BPP, NPPC/2)>& src1, hls::stream<T_UINT(BPP, NPPC/2)>& src2, hls::stream<T_UINT(BPP, NPPC/2)>& src3, hls::stream<T_UINT(BPP, NPPC)>& dst, int width, int height)
{
    const int line_count = height;
    const int words_per_line = width / NPPC;
    for (int y = 0; y < line_count; y++)
    {
#pragma HLS loop_tripcount avg = ROWS max = ROWS
        for (int x = 0; x < words_per_line; x++)
        {
#pragma HLS pipeline II = 1
#pragma HLS loop_tripcount avg = COLS/NPPC max = COLS/NPPC
            T_UINT(BPP, NPPC/2) even_px;
            T_UINT(BPP, NPPC/2) odd_px;
            // Pick the input pair by row parity; only that pair is consumed.
            if ((y % 2) == 0)
            {
                src0 >> even_px;
                src1 >> odd_px;
            }
            else
            {
                src2 >> even_px;
                src3 >> odd_px;
            }
            // Interleave the two half words into one full word.
            T_UINT(BPP, NPPC) word;
            for (int k = 0; k < NPPC/2; k++)
            {
#pragma HLS unroll
                word(BPP * (k * 2 + 1) - 1, BPP * k * 2) = even_px(BPP * (k + 1) - 1, BPP * k);
                word(BPP * (k * 2 + 2) - 1, BPP * (k * 2 + 1)) = odd_px(BPP * (k + 1) - 1, BPP * k);
            }
            dst << word;
        }
    }
}
输入源对象是stream,输出目的对象也是stream。
由于stream不再像mat一样,具有row和col属性, 所以需要传入参数来指定,即width和height。
在三层嵌套的for循环体中,进行逐像素处理。
这里,主要使用了stream中重载的操作符"<<"和">>"。
再来看看输入mat,输出stream的merge。
// Merges four Mats into one full-width stream on a 2x2 grid:
//   even rows: src0 supplies even-position pixels, src1 odd-position pixels
//   odd rows : src2 supplies even-position pixels, src3 odd-position pixels
// The output size is derived from src0 (twice its rows and twice its cols,
// the latter divided by NPPC to get words per line).
template<int TYPE, int BPP, int ROWS, int COLS, int NPPC>
void xfMat_stream_merge(xf::Mat<TYPE, ROWS, COLS, NPPC/2>& src0, xf::Mat<TYPE, ROWS, COLS, NPPC/2>& src1, xf::Mat<TYPE, ROWS, COLS, NPPC/2>& src2, xf::Mat<TYPE, ROWS, COLS, NPPC/2>& src3, hls::stream<T_UINT(BPP, NPPC)>& dst)
{
    const int line_count = src0.rows * 2;
    const int words_per_line = src0.cols * 2 / NPPC;
    // src0/src1 always advance together, as do src2/src3, so one read index
    // per row parity is enough.
    int even_row_idx = 0;
    int odd_row_idx = 0;
    for (int y = 0; y < line_count; y++)
    {
#pragma HLS loop_tripcount avg = ROWS max = ROWS
        for (int x = 0; x < words_per_line; x++)
        {
#pragma HLS pipeline II = 1
#pragma HLS loop_tripcount avg = COLS/NPPC max = COLS/NPPC
            T_UINT(BPP, NPPC/2) even_px;
            T_UINT(BPP, NPPC/2) odd_px;
            if ((y % 2) == 0)
            {
                even_px = src0.read(even_row_idx);
                odd_px = src1.read(even_row_idx);
                even_row_idx++;
            }
            else
            {
                even_px = src2.read(odd_row_idx);
                odd_px = src3.read(odd_row_idx);
                odd_row_idx++;
            }
            // Interleave the two half words into one full word.
            T_UINT(BPP, NPPC) word;
            for (int k = 0; k < NPPC/2; k++)
            {
#pragma HLS unroll
                word(BPP * (k * 2 + 1) - 1, BPP * k * 2) = even_px(BPP * (k + 1) - 1, BPP * k);
                word(BPP * (k * 2 + 2) - 1, BPP * (k * 2 + 1)) = odd_px(BPP * (k + 1) - 1, BPP * k);
            }
            dst << word;
        }
    }
}
输入源对象是mat,输出目的对象是stream。
在三层嵌套的for循环体中,进行逐像素处理。
这里,主要使用了mat中的操作集read,以及stream中重载的操作符"<<"。
+++++++++++++++++++++++++++++++++++++++++++++++++
7) horizon flip
先来看看mat的hflip。
// Optional horizontal mirror of a Mat using a two-line buffer.
// While row i is being stored into one buffer line, row i-1 is read back from
// the other line - in reverse column order when hflip_mode != 0, in the same
// order otherwise. The outer loop therefore runs rows + 1 times: the first
// pass only fills the buffer and the last pass only drains it.
// NOTE(review): words are re-ordered as a whole; pixels inside a word are not
// reversed here - confirm this function is only used with NPPC == 1.
template<int TYPE, int BPP, int ROWS, int COLS, int NPPC>
void xfMat_hflip(xf::Mat<TYPE, ROWS, COLS, NPPC>& src, xf::Mat<TYPE, ROWS, COLS, NPPC>& dst, ap_uint<1> hflip_mode)
{
    u16 height = src.rows;
    u16 width = src.cols;
    T_UINT(BPP, NPPC) line_buffer[2][COLS/NPPC];
#pragma HLS ARRAY_PARTITION variable=line_buffer complete dim=1
    int rd = 0;
    int wr = 0;
    for (u16 y = 0; y < height + 1; y++)
    {
#pragma HLS loop_tripcount avg = ROWS max = ROWS
        for (u16 x = 0; x < width; x++)
        {
#pragma HLS pipeline II = 1
#pragma HLS loop_tripcount avg = COLS/NPPC max = COLS/NPPC
#pragma HLS loop_flatten off
            bool odd_row = ((y % 2) != 0);
            // Fill: even rows go to line 0, odd rows to line 1.
            if (y < height)
            {
                T_UINT(BPP, NPPC) incoming = src.read(rd++);
                if (odd_row) {
                    line_buffer[1][x] = incoming;
                } else {
                    line_buffer[0][x] = incoming;
                }
            }
            // Drain: the previous row sits in the other line.
            if (y > 0)
            {
                u16 col = x;
                if (hflip_mode != 0) {
                    col = width - x - 1;
                }
                T_UINT(BPP, NPPC) outgoing;
                if (odd_row) {
                    outgoing = line_buffer[0][col];
                } else {
                    outgoing = line_buffer[1][col];
                }
                dst.write(wr++, outgoing);
            }
        }
    }
}
输入源对象是mat,输出目的对象是mat。
定义了局部变量line_buffer,作为中间寄存对象。这是一个二维数组,可以存储两行。
是为了实现乒乓操作而设计的。
在两层嵌套for循环体中,逐像素进行处理。
用两个代码块,分别对linebuffer的不同区域进行处理,从而实现乒乓操作。
再来看看stream的hflip。
// Optional horizontal mirror of a pixel stream using a two-line buffer.
// When bit 0 of flip_mode is set, each incoming word is stored at the mirrored
// column and its pixels are reversed with switch_bits(); otherwise the word is
// stored unchanged. Output is always read back in ascending column order, one
// line behind the input.
// The work is split into three phases: the first line (store only), the
// middle lines (store line i while emitting line i-1), and the last line
// (emit only).
// An empty frame (non-positive width / NPPC or height) produces no traffic.
template<int BPP, int ROWS, int COLS, int NPPC>
void stream_hflip(hls::stream<T_UINT(BPP, NPPC)>& src, hls::stream<T_UINT(BPP, NPPC)>& dst, unsigned char flip_mode, int width, int height)
{
    const int rows = height;
    const int cols = width / NPPC;
    // Without this guard, rows == 0 would still consume one line from src and
    // then read line_buffer[(rows - 1) % 2], i.e. a negative bank index.
    if (rows <= 0 || cols <= 0)
    {
        return;
    }
    const bool mirror = ((flip_mode & 0x1) != 0);
    T_UINT(BPP, NPPC) line_buffer[2][COLS / NPPC];
#pragma HLS RESOURCE variable=line_buffer core=RAM_S2P_BRAM
#pragma HLS ARRAY_PARTITION variable=line_buffer complete dim=1
    //first line: store only
    for (int j = 0; j < cols; j++)
    {
#pragma HLS pipeline II = 1
#pragma HLS loop_tripcount avg = COLS/NPPC max = COLS/NPPC
        T_UINT(BPP, NPPC) srcpixel;
        src >> srcpixel;
        int ox = mirror ? (cols - j - 1) : j;
        line_buffer[0][ox] = mirror ? switch_bits<BPP, NPPC>(srcpixel) : srcpixel;
    }
    //middle lines: store line i into one bank, emit line i-1 from the other
    for (int i = 1; i < rows; i++)
    {
#pragma HLS loop_tripcount avg = ROWS max = ROWS
        for (int j = 0; j < cols; j++)
        {
#pragma HLS pipeline II = 1
#pragma HLS loop_tripcount avg = COLS/NPPC max = COLS/NPPC
#pragma HLS dependence variable=line_buffer inter false
            T_UINT(BPP, NPPC) srcpixel;
            src >> srcpixel;
            int ox = mirror ? (cols - j - 1) : j;
            line_buffer[i % 2][ox] = mirror ? switch_bits<BPP, NPPC>(srcpixel) : srcpixel;
            T_UINT(BPP, NPPC) dstpixel = line_buffer[(i - 1) % 2][j];
            dst << dstpixel;
        }
    }
    //last line: emit only
    for (int j = 0; j < cols; j++)
    {
#pragma HLS pipeline II = 1
#pragma HLS loop_tripcount avg = COLS/NPPC max = COLS/NPPC
        T_UINT(BPP, NPPC) dstpixel = line_buffer[(rows - 1) % 2][j];
        dst << dstpixel;
    }
}
输入源对象是stream,输出目的对象是stream。
由于stream不再像mat一样,具有row和col属性, 所以需要传入参数来指定,即width和height。
定义了局部变量line_buffer,作为中间寄存对象。这是一个二维数组,可以存储两行。
是为了实现乒乓操作而设计的。
这里,写法与上述不同,是为了体现出不同的写法也能实现同样的功能。
代码块分为三大块,
首行写入linebuffer,不读取。
中间行写入linebuffer,同时读取之前缓存的数据,进行输出。
尾行不写入linebuffer,只读取之前缓存的数据,进行输出。
这里使用了一个辅助函数,switch_bits,
// Reverses the order of the NPPC pixels packed in one word: pixel k of the
// result is pixel (NPPC - 1 - k) of the input. The bits inside each
// BPP-wide pixel keep their order.
template<int BPP, int NPPC>
T_UINT(BPP, NPPC) switch_bits(T_UINT(BPP, NPPC) pixel)
{
    T_UINT(BPP, NPPC) val;
    for (int k = 0; k < NPPC; k++)
    {
        // The unroll directive has to live inside the loop body it applies to.
#pragma HLS unroll
        val(BPP * (k + 1) - 1, BPP * k) = pixel(BPP * (NPPC - k) - 1, BPP * (NPPC - k - 1));
    }
    return val;
}
+++++++++++++++++++++++++++++++++++++++++++++++++++
8)vertical flip
先来看看mm2s_dma
// Moves one burst of BURST_WORD words from memory to a stream.
// The words src[n] .. src[n + BURST_WORD - 1] are first copied into a local
// buffer and then pushed to `dst` in order. On return `n` has been advanced
// by BURST_WORD so the next call continues where this one stopped.
template<int BPP, int NPPC, int BURST_WORD>
void mm2s_dma(T_UINT(BPP, NPPC)* src, hls::stream<T_UINT(BPP, NPPC)>& dst, int& n)
{
    T_UINT(BPP, NPPC) burst[BURST_WORD];
#pragma HLS DATAFLOW
    T_UINT(BPP, NPPC)* base = src + n;
    // Stage 1: memory -> local buffer.
    for (int idx = 0; idx < BURST_WORD; idx++)
    {
#pragma HLS loop_tripcount avg = BURST_WORD max = BURST_WORD
#pragma HLS pipeline II = 1
        burst[idx] = base[idx];
    }
    // Stage 2: local buffer -> stream.
    for (int idx = 0; idx < BURST_WORD; idx++)
    {
#pragma HLS loop_tripcount avg = BURST_WORD max = BURST_WORD
#pragma HLS pipeline II = 1
        T_UINT(BPP, NPPC) word = burst[idx];
        dst << word;
    }
    n += BURST_WORD;
}
输入是AXIMM,输出是stream,以burst_word为一笔传输的长度进行操作。
定义了一个linebuffer,具有burst_word的长度。
首先从AXIMM中,读取一笔数据,存储到linebuffer中。
然后将linebuffer中的一笔数据,输出到stream中去。
基于此,来看看vflip。
// Streams a frame out of memory row by row, optionally upside down.
// When bit 1 of flip_mode is set, output row i is fetched from source row
// (rows - 1 - i); otherwise rows are fetched in order. Each row is transferred
// as width / NPPC / BURST_WORD bursts through mm2s_dma.
// NOTE(review): the row stride is computed from the burst count, so width / NPPC
// is presumably a multiple of BURST_WORD - confirm against callers.
template<int BPP, int ROWS, int COLS, int NPPC, int BURST_WORD>
void axim2stram_vflip(T_UINT(BPP, NPPC)* src, hls::stream<T_UINT(BPP, NPPC)>& dst, unsigned char flip_mode, int width, int height)
{
    const int line_count = height;
    const int bursts_per_line = width / NPPC / BURST_WORD;
    const bool upside_down = ((flip_mode & 0x2) != 0);
    for (int out_row = 0; out_row < line_count; out_row++)
    {
#pragma HLS loop_tripcount avg = ROWS max = ROWS
        int src_row = out_row;
        if (upside_down)
        {
            src_row = line_count - out_row - 1;
        }
        // Word index of the first burst of the row; mm2s_dma advances it.
        int word_index = src_row * bursts_per_line * BURST_WORD ;
        for (int burst = 0; burst < bursts_per_line; burst++)
        {
#pragma HLS loop_tripcount avg = COLS/NPPC/BURST_WORD max = COLS/NPPC/BURST_WORD
            mm2s_dma<BPP, NPPC, BURST_WORD>(src, dst, word_index);
        }
    }
}
输入是AXIMM,输出是stream。
由于输入的AXIMM是可以随机读取的,所以,只需要按照从后到前给出行地址,即可实现翻转。
基于此,再来看看vflip_vec。
// Same as the single-frame vertical flip reader, but the frame is selected out
// of NUMBER consecutive frames in memory: the start of frame (index % NUMBER)
// is width * height * (index % NUMBER) / NPPC words from `src`.
// When bit 1 of flip_mode is set, output row i is fetched from source row
// (rows - 1 - i); otherwise rows are fetched in order.
template<int BPP, int ROWS, int COLS, int NPPC, int BURST_WORD, int NUMBER>
void axim2stram_vflip_vec(T_UINT(BPP, NPPC)* src, hls::stream<T_UINT(BPP, NPPC)>& dst, unsigned char index, unsigned char flip_mode, int width, int height)
{
    const int line_count = height;
    const int bursts_per_line = width / NPPC / BURST_WORD;
    const int frame_base = width * height * (index % NUMBER) / NPPC;
    const bool upside_down = ((flip_mode & 0x2) != 0);
    for (int out_row = 0; out_row < line_count; out_row++)
    {
#pragma HLS loop_tripcount avg = ROWS max = ROWS
        int src_row = out_row;
        if (upside_down)
        {
            src_row = line_count - out_row - 1;
        }
        // Word index of the first burst of the row; mm2s_dma advances it.
        int word_index = src_row * bursts_per_line * BURST_WORD + frame_base;
        for (int burst = 0; burst < bursts_per_line; burst++)
        {
#pragma HLS loop_tripcount avg = COLS/NPPC/BURST_WORD max = COLS/NPPC/BURST_WORD
            mm2s_dma<BPP, NPPC, BURST_WORD>(src, dst, word_index);
        }
    }
}
区别在于offset,这个offset用于在frame_ringbuffer中找到index对应的frame。
+++++++++++++++++++++++++++++++++++++++++
9)zoom
// Top-level zoom pipeline: AXI stream in -> Mat -> centered ROI crop ->
// bilinear resize to dst_width x dst_height -> NPPC conversion -> optional
// horizontal flip -> AXI stream out.
// The ROI is the destination size multiplied by `scale`, centered in the
// source frame.
void video_zoom(hls::stream<T_AXIU(VIDEO_BPP, VIDEO_NPPC)>& src, hls::stream<T_AXIU(VPOST_BPP, VPOST_NPPC)>& dst, u16 src_width, u16 src_height, u16 dst_width, u16 dst_height, float scale, ap_uint<1> hflip_mode)
{
// Pixel data travels over AXI4-Stream ports.
#pragma HLS INTERFACE axis register both port=src
#pragma HLS INTERFACE axis register both port=dst
// Block control and all scalar parameters share the CONTROL_BUS AXI4-Lite bundle.
#pragma HLS INTERFACE s_axilite port=return bundle=CONTROL_BUS
#pragma HLS INTERFACE s_axilite port=src_width bundle=CONTROL_BUS
#pragma HLS INTERFACE s_axilite port=src_height bundle=CONTROL_BUS
#pragma HLS INTERFACE s_axilite port=dst_width bundle=CONTROL_BUS
#pragma HLS INTERFACE s_axilite port=dst_height bundle=CONTROL_BUS
#pragma HLS INTERFACE s_axilite port=scale bundle=CONTROL_BUS
#pragma HLS INTERFACE s_axilite port=hflip_mode bundle=CONTROL_BUS
// The scalar parameters are additionally declared ap_stable.
#pragma HLS INTERFACE ap_stable port=src_width
#pragma HLS INTERFACE ap_stable port=src_height
#pragma HLS INTERFACE ap_stable port=dst_width
#pragma HLS INTERFACE ap_stable port=dst_height
#pragma HLS INTERFACE ap_stable port=scale
#pragma HLS INTERFACE ap_stable port=hflip_mode
// Fixed-point copy of `scale` (the name is presumably a misspelling of "ratio").
ap_fixed<32,16> radio = scale;
u16 roi_width = dst_width * radio;
// Round the ROI width down to a multiple of 2 * VIDEO_NPPC.
roi_width = roi_width / VIDEO_NPPC / 2 * VIDEO_NPPC * 2;
u16 roi_height = dst_height * radio;
// Center the ROI inside the source frame.
// NOTE(review): if the ROI is larger than the source the subtraction goes
// negative before being stored in a u16 - confirm callers bound `scale`.
u16 roi_x = (src_width - roi_width) / 2;
u16 roi_y = (src_height - roi_height) / 2;
// Trace of source size, ROI rectangle and destination size.
printf("(%d %d) (%d %d %d %d) (%d %d)\n", src_width, src_height, roi_x, roi_y, roi_width, roi_height, dst_width, dst_height);
// Intermediate images, one per pipeline stage:
// img0 = input frame, img1 = cropped ROI, img2 = resized image,
// img3 = resized image at VPOST_NPPC, img4 = final (optionally flipped) image.
xf::Mat<VIDEO_TYPE, VIDEO_HEIGHT, VIDEO_WIDTH, VIDEO_NPPC> img0(src_height, src_width);
xf::Mat<VIDEO_TYPE, VIDEO_HEIGHT, VIDEO_WIDTH, VIDEO_NPPC> img1(roi_height, roi_width);
xf::Mat<VIDEO_TYPE, VPOST_HEIGHT, VPOST_WIDTH, VIDEO_NPPC> img2(dst_height, dst_width);
xf::Mat<VPOST_TYPE, VPOST_HEIGHT, VPOST_WIDTH, VPOST_NPPC> img3(dst_height, dst_width);
xf::Mat<VPOST_TYPE, VPOST_HEIGHT, VPOST_WIDTH, VPOST_NPPC> img4(dst_height, dst_width);
// Each intermediate image is implemented as a stream of depth 1024.
#pragma HLS stream variable=img0.data dim=1 depth=1024
#pragma HLS stream variable=img1.data dim=1 depth=1024
#pragma HLS stream variable=img2.data dim=1 depth=1024
#pragma HLS stream variable=img3.data dim=1 depth=1024
#pragma HLS stream variable=img4.data dim=1 depth=1024
// The stages below form a dataflow region; each consumes the previous stage's output.
#pragma HLS dataflow
xf::AXIvideo2xfMat(src, img0);
xfMat_crop<VIDEO_TYPE, VIDEO_BPP, VIDEO_HEIGHT, VIDEO_WIDTH, VIDEO_NPPC>(img0, img1, roi_x, roi_y);
xf::resize <XF_INTERPOLATION_BILINEAR, VIDEO_TYPE, VIDEO_HEIGHT, VIDEO_WIDTH, VPOST_HEIGHT, VPOST_WIDTH, VIDEO_NPPC, 5> (img1, img2);
// xfMat_nppc_down is not defined in this section; presumably it repacks
// VIDEO_NPPC-wide words into VPOST_NPPC-wide words - TODO confirm.
xfMat_nppc_down<VIDEO_TYPE, VIDEO_BPP, VPOST_HEIGHT, VPOST_WIDTH, VIDEO_NPPC, VPOST_NPPC>(img2, img3);
xfMat_hflip<VPOST_TYPE, VPOST_BPP, VPOST_HEIGHT, VPOST_WIDTH, VPOST_NPPC>(img3, img4, hflip_mode);
xf::xfMat2AXIvideo(img4, dst);
}
该函数基于之前所定义的基础算法函数,以及xfopencv库提供的库函数,完成功能。
输入是一个AXIS的数据流,输出也是一个AXIS的数据流。
首先,通过 xf::AXIvideo2xfMat函数,将输入的AXIS转换成mat。
然后,通过xfMat_crop的具象函数,对mat进行ROI裁剪。
然后,通过xf::resize的具象函数,对mat进行双线性插值缩放。
然后,通过xfMat_nppc_down的具象函数,降低总线位宽。
然后,通过xfMat_hflip的具象函数,对mat进行翻转。
然后,通过xf::xfMat2AXIvideo函数,将mat转换成AXIS输出。
++++++++++++++++++++++++++++++++++++++++++