for(int j = 0; j < dim; j++) {
#pragma HLS PIPELINE
#pragma HLS LOOP_TRIPCOUNT min=c_size_min max=c_size_max
out[j] = in1[j] * in2[j];
//dim 40960
//Optimization applied after adding
//#pragma SDS data zero_copy(in1[0:dim], in2[0:dim], out[0:dim])
//(in the complexf-1 project):
//loop tripcount constant
const int c_size = DATA_SIZE;//64
//Local memory to store input and output matrices
int local_in1[MAX_SIZE][MAX_SIZE];//64*64
int local_in2[MAX_SIZE][MAX_SIZE];
int local_out[MAX_SIZE][MAX_SIZE];
//Physical implementation of memories have only a limited number of read/write
//ports, that can be overcome by using the ARRAY_PARTITION pragma
// #pragma HLS ARRAY_PARTITION variable=local_in1 complete dim=2
// #pragma HLS ARRAY_PARTITION variable=local_in2 complete dim=1
//When loop_3 is unrolled automatically, the column "k" in local_in1[i][k] varies,
//whereas in local_in2[k][j] the row "k" varies. For effective pipelined
//processing, local_in1 should therefore be partitioned in dimension 2 and
//local_in2 in dimension 1. Note: both ARRAY_PARTITION pragmas above are
//currently commented out, so this partitioning is not applied here.
//Copy the two input matrices from in1 and in2 into the local buffers
//local_in1 and local_in2 (the original note describes this as a burst read
//from DDR memory). One flattened loop walks all dim*dim elements: iter is the
//linear index into in1/in2, while i and j track the matching row and column.
//NOTE(review): nothing visible here checks dim <= MAX_SIZE; confirm the caller
//guarantees it, otherwise the local buffers are indexed out of range.
read_in: for(int iter = 0, i=0, j=0; iter< dim*dim; iter++,j++){
#pragma HLS PIPELINE
#pragma HLS LOOP_TRIPCOUNT min=c_size*c_size max=c_size*c_size
//j is incremented in the loop header, so wrap to the next row once it reaches dim.
if( j== dim){ j = 0; i++; }
//Only the real part of each in1 element is kept; in2 elements are stored as-is.
//NOTE(review): in1 presumably holds complex values (it exposes .real()) - confirm.
local_in1[i][j] = in1[iter].real();
local_in2[i][j] = in2[iter];
}
//Reads the input_data from local memory, performs the
//computations and writes the data to local memory.
loop_1: for (int i = 0 ; i < dim ; i++){
#pragma HLS LOOP_TRIPCOUNT min=c_size max=c_size
loop_2: for(int j = 0 ; j < dim ; j++){
#pragma HLS LOOP_TRIPCOUNT min=c_size max=c_size
//Pipelining a loop results in automatic unrolling of inner loops by the HLS compiler.
#pragma HLS PIPELINE
int res = 0;
loop_3: for(int k = 0; k < c_size; k++){
//#pragma HLS PIPELINE 12/15/3808/2401
//To enable automatic unrolling of loop, the no. of iterations
//need to be a compile time constant, so 'c_size' is specified
//here instead of 'dim', which is not a compile time constant.
res += local_in1[i][k]*local_in2[k][j];
}
local_out[i][j] = res;
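//When the size is changed to 640, the build fails with an error: not enough BRAM.
//(If 640 refers to MAX_SIZE, the three local int arrays would need about
//3 * 640 * 640 * 4 bytes = ~4.7 MiB of on-chip memory, assuming 4-byte int.)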