矩阵乘法的代码如下
/**
 * Dense integer matrix multiply: AB = A * B.
 *
 * A is N x M, B is M x P and AB is N x P. N, M and P are compile-time
 * constants that are declared outside this function.
 */
void matrixmul(int A[N][M], int B[M][P], int AB[N][P]) {
/* Reshape A along its second dimension and B along its first dimension;
 * those are exactly the dimensions indexed by the inner-product loop. */
#pragma HLS ARRAY_RESHAPE variable=A complete dim=2
#pragma HLS ARRAY_RESHAPE variable=B complete dim=1
    /* Visit every (row, column) position of the result. */
    row: for (int r = 0; r < N; ++r) {
        col: for (int c = 0; c < P; ++c) {
/* The pipeline directive is placed in the body of the column loop. */
#pragma HLS PIPELINE II=1
            /* Inner product of row r of A with column c of B. */
            int dot = 0;
            product: for (int t = 0; t < M; ++t) {
                dot = dot + A[r][t] * B[t][c];
            }
            AB[r][c] = dot;
        }
    }
}
这次把 PIPELINE 指令放在 row 循环上：
为了进一步优化，下面对矩阵乘法进行分块（block）处理，
代码如下
头文件
#ifndef _BLOCK_MM_H_
#define _BLOCK_MM_H_
#include "hls_stream.h"
#include <iostream>
#include <iomanip>
#include <vector>
using namespace std;
typedef int DTYPE;
const int SIZE = 8;
const int BLOCK_SIZE = 4;
typedef struct { DTYPE a[BLOCK_SIZE]; } blockvec;
typedef struct { DTYPE out[BLOCK_SIZE][BLOCK_SIZE]; } blockmat;
void blockmatmul(hls::stream<blockvec> &Arows, hls::stream<blockvec> &Bcols,
blockmat & ABpartial, DTYPE iteration);
#endif
主代码
#include "block_mm.h"
/*
 * Computes one BLOCK_SIZE x BLOCK_SIZE tile of a matrix product.
 *
 * Arows     - read SIZE times, but only when it % (SIZE/BLOCK_SIZE) == 0;
 *             item i supplies A[0..BLOCK_SIZE-1][i] of the stored A block.
 * Bcols     - read SIZE times on every call; item k supplies the BLOCK_SIZE
 *             values multiplied against column k of the stored A block.
 * ABpartial - receives the finished tile.
 * it        - call index used to decide whether the A block is reloaded.
 */
void blockmatmul(hls::stream<blockvec> &Arows, hls::stream<blockvec> &Bcols,
                 blockmat &ABpartial, int it) {
#pragma HLS DATAFLOW
    const int phase = it % (SIZE/BLOCK_SIZE);
    /* static: the A block is kept between calls so it can be reused. */
    static DTYPE A[BLOCK_SIZE][SIZE];

    /* Refill the stored A block only on the first call of each group. */
    if (phase == 0) {
        loadA: for (int col = 0; col < SIZE; col++) {
            blockvec incomingA = Arows.read();
            for (int row = 0; row < BLOCK_SIZE; row++) {
#pragma HLS PIPELINE II=1
                A[row][col] = incomingA.a[row];
            }
        }
    }

    /* Accumulator for the tile, zero-initialised. */
    DTYPE tile[BLOCK_SIZE][BLOCK_SIZE] = { 0 };

    /* Each stream item of B contributes one rank-1 update to the tile. */
    partialsum: for (int k = 0; k < SIZE; k++) {
        blockvec incomingB = Bcols.read();
        for (int r = 0; r < BLOCK_SIZE; r++) {
            for (int c = 0; c < BLOCK_SIZE; c++) {
                tile[r][c] += A[r][k] * incomingB.a[c];
            }
        }
    }

    /* Copy the accumulated tile to the output argument. */
    writeoutput: for (int r = 0; r < BLOCK_SIZE; r++) {
        for (int c = 0; c < BLOCK_SIZE; c++) {
            ABpartial.out[r][c] = tile[r][c];
        }
    }
}
testbench
#include "block_mm.h"
#include <stdlib.h>
using namespace std;
/*
 * Software reference multiply: out = A * B for SIZE x SIZE matrices.
 * The testbench compares its result against the blockmatmul output.
 */
void matmatmul_sw(DTYPE A[SIZE][SIZE], DTYPE B[SIZE][SIZE],
                  DTYPE out[SIZE][SIZE]){
    for (int r = 0; r < SIZE; ++r) {
        for (int c = 0; c < SIZE; ++c) {
            /* Inner product of row r of A with column c of B. */
            DTYPE acc = 0;
            for (int t = 0; t < SIZE; ++t) {
                acc += A[r][t] * B[t][c];
            }
            out[r][c] = acc;
        }
    }
}
/*
 * Testbench: fills A and B with pseudo-random values, computes A * B tile by
 * tile through blockmatmul, and compares the result with matmatmul_sw.
 * Prints "passed" or "failed" and returns 0 on success, 1 on mismatch.
 */
int main() {
    int fail = 0;
    hls::stream<blockvec> strm_matrix1("strm_matrix1");
    hls::stream<blockvec> strm_matrix2("strm_matrix2");
    blockvec strm_matrix1_element, strm_matrix2_element;
    blockmat block_out;
    DTYPE A[SIZE][SIZE], B[SIZE][SIZE];
    DTYPE matrix_swout[SIZE][SIZE], matrix_hwout[SIZE][SIZE];

    /* Initialise every row of the inputs and clear both result matrices.
     * This loop must complete before any data is streamed, otherwise the
     * multiply would read rows that have not been written yet. */
    initmatrices: for(int i = 0; i < SIZE; i++){
        for(int j = 0; j < SIZE; j++){
            A[i][j] = rand() % 512;
            B[i][j] = rand() % 512;
            matrix_swout[i][j] = 0;
            matrix_hwout[i][j] = 0;
        }
    }

    /* it counts blockmatmul calls; rows of A are streamed only when
     * it % (SIZE/BLOCK_SIZE) == 0, matching the reload condition inside
     * blockmatmul. */
    int row, col, it = 0;
    for(int it1 = 0; it1 < SIZE; it1 = it1 + BLOCK_SIZE) {
        for(int it2 = 0; it2 < SIZE; it2 = it2 + BLOCK_SIZE) {
            row = it1; /* first row of A / of the output tile */
            col = it2; /* first column of B / of the output tile */

            /* Stream SIZE items: item k holds BLOCK_SIZE entries of column k
             * of A (rows row..row+BLOCK_SIZE-1) and BLOCK_SIZE entries of
             * row k of B (columns col..col+BLOCK_SIZE-1). */
            for(int k = 0; k < SIZE; k++) {
                for(int i = 0; i < BLOCK_SIZE; i++) {
                    if(it % (SIZE/BLOCK_SIZE) == 0) strm_matrix1_element.a[i] = A[row+i][k];
                    strm_matrix2_element.a[i] = B[k][col+i];
                }
                if(it % (SIZE/BLOCK_SIZE) == 0) strm_matrix1.write(strm_matrix1_element);
                strm_matrix2.write(strm_matrix2_element);
            }

            blockmatmul(strm_matrix1, strm_matrix2, block_out, it);

            /* Place the returned tile into the full hardware result. */
            for(int i = 0; i < BLOCK_SIZE; i++)
                for(int j = 0; j < BLOCK_SIZE; j++)
                    matrix_hwout[row+i][col+j] = block_out.out[i][j];
            it = it + 1;
        }
    }

    matmatmul_sw(A, B, matrix_swout);

    for(int i = 0; i<SIZE; i++)
        for(int j = 0; j<SIZE; j++)
            if(matrix_swout[i][j] != matrix_hwout[i][j]) { fail=1; }

    if(fail==1) cout << "failed" << endl;
    else cout << "passed" << endl;

    /* A non-zero exit status lets the simulation flow detect the mismatch. */
    return fail;
}