Note: because FPGA resources are limited, the 120-unit fully connected layer is reduced to 60 units, and the 84-unit fully connected layer to 16 units.
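For a back-of-the-envelope sense of the saving (an illustrative calculation, not part of the project code), compare the fully connected weight counts before and after the change:

/* Rough FC weight-count comparison (illustrative only). */
#include <stdio.h>

int main(void) {
    int orig = 400*120 + 120*84 + 84*10;  /* classic LeNet-5: 58920 weights */
    int cut  = 400*60  + 60*16  + 16*10;  /* this design:     25120 weights */
    printf("FC weights: %d -> %d (%.0f%% fewer)\n",
           orig, cut, 100.0 * (orig - cut) / orig);
    return 0;
}

Roughly 57% of the fully connected weight storage disappears, which is the point of the change.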
PyTorch code for building the model
# Load in relevant libraries, and alias where appropriate
import time
import numpy as np
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from torchsummary import summary
import matplotlib.pyplot as plt
# Define relevant variables for the ML task
batch_size = 64
num_classes = 10
learning_rate = 0.001
num_epochs = 20
# Device will determine whether to run the training on GPU or CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Loading the dataset and preprocessing
train_dataset = torchvision.datasets.MNIST(root='./data',
                                           train=True,
                                           transform=transforms.Compose([
                                               transforms.Resize((32, 32)),
                                               transforms.ToTensor(),
                                               # transforms.Normalize(mean=(0.1307,), std=(0.3081,))
                                           ]),
                                           download=True)
test_dataset = torchvision.datasets.MNIST(root='./data',
                                          train=False,
                                          transform=transforms.Compose([
                                              transforms.Resize((32, 32)),
                                              transforms.ToTensor(),
                                              # transforms.Normalize(mean=(0.1325,), std=(0.3105,))
                                          ]),
                                          download=True)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=batch_size,
                                           shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                          batch_size=batch_size,
                                          shuffle=True)
# Defining the convolutional neural network
class LeNet5(nn.Module):
    def __init__(self, num_classes):
        super(LeNet5, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(1, 6, kernel_size=5, stride=1, padding=0),
            nn.AvgPool2d(kernel_size=2, stride=2))
        self.layer2 = nn.Sequential(
            nn.Conv2d(6, 16, kernel_size=5, stride=1, padding=0),
            nn.AvgPool2d(kernel_size=2, stride=2))
        self.fc = nn.Linear(400, 60)
        self.fc1 = nn.Linear(60, 16)
        self.fc2 = nn.Linear(16, num_classes)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = out.reshape(out.size(0), -1)
        out = self.fc(out)
        out = self.sigmoid(out)
        out = self.fc1(out)
        out = self.sigmoid(out)
        out = self.fc2(out)
        return out

model = LeNet5(num_classes).to(device)
# Setting the loss function
cost = nn.CrossEntropyLoss()
# Setting the optimizer with the model parameters and learning rate
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# this is defined to print how many steps are remaining when training
total_step = len(train_loader)
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        images = images.to(device)
        labels = labels.to(device)
        # Forward pass (outputs are already on `device`; no extra .cuda() call)
        outputs = model(images)
        loss = cost(outputs, labels)
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if (i+1) % 400 == 0:
            print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                  .format(epoch+1, num_epochs, i+1, total_step, loss.item()))
# Test the model
# In the test phase we don't need to compute gradients (for memory efficiency)
with torch.no_grad():
    correct = 0
    total = 0
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    print('Accuracy of the network on the 10000 test images: {} %'.format(100 * correct / total))
# network architecture
summary(model, input_size=(1, 32, 32), device=str(device))
# save model
torch.save(model, "model.pth")
# load model
model_path = "model.pth"
model = torch.load(model_path, map_location=torch.device('cpu'))
param = {}
for name, parameters in model.cpu().state_dict().items():
    param[name] = parameters.detach().numpy()
# get a batch of images
images, labels = next(iter(test_loader))
images = images.numpy()
labels = labels.numpy()
# show the images
fig = plt.figure(figsize=(10, 10))
for i, (label, img) in enumerate(zip(labels[:9], images[:9]), start=1):
    ax = fig.add_subplot(3, 3, i)
    ax.title.set_text("label: " + str(label))
    plt.imshow(img.reshape(img.shape[1], img.shape[2]))
plt.show()
for label, out in zip(labels[:9], model(torch.tensor(images[:9]))):
    print("The truth: ", label, "predicted label: ", out.argmax().tolist())
# test inference time
start = time.time()
pred_label = model(torch.tensor(images[:1])).argmax().tolist()
end = time.time() - start
print("The predicted label is: ", pred_label, " the inference time is: ", str(end * 1000) + "ms.")
# extract the images
for label, img in zip(labels[:9], images[:9]):
    shape = img.shape
    img = img.astype(np.float32)
    pic_name = "input_" + str(label)
    with open(pic_name + ".h", "w") as f:
        new_str2 = str(img.tolist())
        new_str2 = new_str2.replace("[", "")
        new_str2 = new_str2.replace("]", "")
        f.write("float " + pic_name + "[" + str(shape[1]*shape[2]) + "]" + " = {" + new_str2 + "};\n\n")
# extract the model parameters
for key in param.keys():
    shape = param[key].shape
    new_str1 = str(param[key].tolist())
    new_str1 = new_str1.replace("[", "")
    new_str1 = new_str1.replace("]", "")
    name = key.replace(".", "_")
    size = int(np.prod(shape))  # flatten any parameter rank into one array length
    with open(name + ".h", "w") as f:
        f.write("float " + name + "[" + str(size) + "]" + " = {" + new_str1 + "};\n\n")
    print(key + " saved successfully!")
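Each exported header declares a single flat float array named after the parameter, with dots replaced by underscores. As an illustration, the smallest one, layer1_0_bias.h, would look roughly like this (the six values below are placeholders, not trained values):

/* illustrative layer1_0_bias.h; the real values come from training */
float layer1_0_bias[6] = {0.0123f, -0.0456f, 0.0789f, -0.0101f, 0.0112f, -0.0131f};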
HLS code
#include "HLS/hls.h"
#include "stdio.h"
float expf(float x) {
x = 1.0 + x / 1024;
x *= x; x *= x; x *= x; x *= x; x *= x;
x *= x; x *= x; x *= x; x *= x; x *= x;
return x;
}
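This expf deliberately shadows the math-library function: ten squarings raise (1 + x/1024) to the power 2^10 = 1024, which maps onto multipliers far more cheaply than a true exponential. A small host-side check (a sketch, not part of the HLS source; exp_approx is a local name chosen to avoid clashing with libm's expf) shows the error stays small over the sigmoid's useful input range:

/* Host-only sanity check of the (1 + x/1024)^1024 approximation. */
#include <stdio.h>
#include <math.h>

static float exp_approx(float x) {
    x = 1.0f + x / 1024.0f;
    for (int i = 0; i < 10; i++)  /* ten squarings: (1 + x/1024)^(2^10) */
        x *= x;
    return x;
}

int main(void) {
    for (float x = -4.0f; x <= 4.0f; x += 2.0f)
        printf("x=%5.1f  approx=%9.5f  libm=%9.5f\n", x, exp_approx(x), expf(x));
    return 0;
}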
// Average four neighboring pixel values into one output value.
float AvgPool_2x2(float input[4]) {
    float res = 0;
    int i;
    for (i = 0; i < 4; i++) {
        res += input[i];
    }
    res /= 4;
    return res;
}
float sigmoid(float x)
{
    return (1 / (1 + expf(-x)));
}
// 5x5 dot product between an image patch and a kernel.
float Conv_5x5(float input[25], float kernel[25]) {
    int x, y;
    float result = 0;
    for (y = 0; y < 5; y++) {
        for (x = 0; x < 5; x++) {
            result += input[x + y*5] * kernel[x + y*5];
        }
    }
    return result;
}
// C1 kernels: 5x5x6 = 25x6 = 150 weights
void ConvLayer_1(float input[1024], float *C1_value, float *weights, float *bias) {
    int i_y, i_x, matrix_y, matrix_x;  // loop and scratch variables
    int k_num, mat_i = 0;              // kernel index, weight index
    // C1 has 6 different kernels, so the outer loop runs 6 times and
    // produces 6 feature maps.
    for (k_num = 0; k_num < 6; k_num++) {
        // matrix_2 holds the 25 weights of the current 5x5 kernel.
        float matrix_2[25];
        for (mat_i = 0; mat_i < 25; mat_i++) {
            matrix_2[mat_i] = weights[mat_i + k_num*25];
        }
        // One pass of a kernel over the 32x32 input yields a 28x28 output.
        for (i_y = 0; i_y < 28; i_y++) {
            for (i_x = 0; i_x < 28; i_x++) {
                float matrix[25];
                int pic_value_index = i_x + i_y * 32;
                // Gather the 25 input pixels of the current 5x5 window into matrix.
                for (matrix_y = 0; matrix_y < 5; matrix_y++) {
                    for (matrix_x = 0; matrix_x < 5; matrix_x++) {
                        // window-local index: 0..24
                        int matrix_index = matrix_x + matrix_y * 5;
                        // input pixel index: 0..1023, with a row stride of 32
                        int input_value_index = pic_value_index + matrix_x + matrix_y * 32;
                        matrix[matrix_index] = input[input_value_index];
                    }
                }
                // C1 outputs 28x28x6 = 4704 values; out_pic_index addresses one of them.
                int out_pic_index = i_x + i_y * 28 + k_num * 784;
                // Basic convolution unit: dot product of window and kernel, plus bias.
                C1_value[out_pic_index] = Conv_5x5(matrix, matrix_2) + bias[k_num];
            }
        }
    }
}
void AvgpoolLayer_2(float input[4704], float *A2_value) {
    int k_num, i_y, i_x, matrix_x, matrix_y;
    int count = 0;
    // 6 feature maps to pool
    for (k_num = 0; k_num < 6; k_num++) {
        // walk each 28x28 feature map with a stride-2 2x2 window
        for (i_y = 0; i_y < 27; i_y += 2) {
            for (i_x = 0; i_x < 27; i_x += 2) {
                float matrix[4];
                // index of the current pixel inside the 28x28x6 input
                int index_now = i_x + i_y * 28 + k_num * 784;
                // collect the 2x2 window into matrix
                for (matrix_y = 0; matrix_y < 2; matrix_y++) {
                    for (matrix_x = 0; matrix_x < 2; matrix_x++) {
                        int input_index = index_now + matrix_x + matrix_y * 28;
                        matrix[matrix_x + matrix_y*2] = input[input_index];
                    }
                }
                // average the four values; no activation here, matching the
                // PyTorch model (a sigmoid variant is left commented out)
                // A2_value[count] = sigmoid(AvgPool_2x2(matrix));
                A2_value[count] = AvgPool_2x2(matrix);
                count++;  // next position in the 14x14x6 output
            }
        }
    }
}
// C3 kernels: 5x5x6x16 = 25x6x16 = 2400 weights
void ConvLayer_3(float input[1176], float *C3_value, float *weights, float *bias) {
    int k_num, nk_num, i_y, i_x, matrix_x, matrix_y;
    int mat_i;
    // 16 output channels, each a 10x10 map
    for (nk_num = 0; nk_num < 16; nk_num++) {
        for (i_y = 0; i_y < 10; i_y++) {
            for (i_x = 0; i_x < 10; i_x++) {
                float res_total_6 = 0;
                float matrix[25];
                int index_now = i_x + i_y * 10 + nk_num * 100;
                // accumulate the convolution over the 6 input channels
                for (k_num = 0; k_num < 6; k_num++) {
                    float matrix_2[25];
                    int input_index_now = k_num*14*14 + i_x + i_y*14;
                    // weight layout matches PyTorch's flattened (out, in, 5, 5) tensor
                    for (mat_i = 0; mat_i < 25; mat_i++) {
                        int weights_index = mat_i + k_num*25 + nk_num*150;
                        matrix_2[mat_i] = weights[weights_index];
                    }
                    for (matrix_y = 0; matrix_y < 5; matrix_y++) {
                        for (matrix_x = 0; matrix_x < 5; matrix_x++) {
                            int matrix_index = matrix_x + matrix_y * 5;
                            int input_value_index = input_index_now + matrix_x + matrix_y * 14;
                            matrix[matrix_index] = input[input_value_index];
                        }
                    }
                    res_total_6 += Conv_5x5(matrix, matrix_2);
                }
                C3_value[index_now] = res_total_6 + bias[nk_num];
            }
        }
    }
}
// pools the 10x10x16 C3 output down to 5x5x16
void AvgpoolLayer_4(float input[1600], float *A4_value) {
    int k_num, i_y, i_x, matrix_x, matrix_y;
    int count = 0;
    for (k_num = 0; k_num < 16; k_num++) {
        for (i_y = 0; i_y < 10; i_y += 2) {
            for (i_x = 0; i_x < 10; i_x += 2) {
                float matrix[4];
                int index_now = i_x + i_y * 10 + k_num * 100;
                for (matrix_y = 0; matrix_y < 2; matrix_y++) {
                    for (matrix_x = 0; matrix_x < 2; matrix_x++) {
                        int input_index = index_now + matrix_x + matrix_y * 10;
                        matrix[matrix_x + matrix_y*2] = input[input_index];
                    }
                }
                // A4_value[count] = sigmoid(AvgPool_2x2(matrix));
                A4_value[count] = AvgPool_2x2(matrix);
                count++;
            }
        }
    }
}
// F5 weights: 400x60 = 24000
void FullyConnLayer_5(float input[400], float *F5_value, float *weights, float *bias) {
    int i_y, i_x;
    for (i_y = 0; i_y < 60; i_y++) {
        float res = 0;
        for (i_x = 0; i_x < 400; i_x++) {
            int index = i_x + i_y * 400;
            res += input[i_x] * weights[index];
        }
        F5_value[i_y] = sigmoid(res + bias[i_y]);
    }
}
// F6 weights: 60x16 = 960
void FullyConnLayer_6(float input[60], float *F6_value, float *weights, float *bias) {
    int i_y, i_x;
    for (i_y = 0; i_y < 16; i_y++) {
        float res = 0;
        for (i_x = 0; i_x < 60; i_x++) {
            int index = i_x + i_y * 60;
            res += input[i_x] * weights[index];
        }
        F6_value[i_y] = sigmoid(res + bias[i_y]);
    }
}
// F7 weights: 16x10 = 160; no sigmoid on the output layer,
// the raw scores go straight to the argmax
void FullyConnLayer_7(float input[16], float *F7_value, float *weights, float *bias) {
    int i_y, i_x;
    for (i_y = 0; i_y < 10; i_y++) {
        float res = 0;
        for (i_x = 0; i_x < 16; i_x++) {
            int index = i_x + i_y * 16;
            res += input[i_x] * weights[index];
        }
        F7_value[i_y] = res + bias[i_y];
    }
}
hls_avalon_slave_component
component int LeNet5(
    hls_avalon_slave_memory_argument(1024 *sizeof(float)) float *img,
    hls_avalon_slave_memory_argument(150  *sizeof(float)) float *c1_weight,
    hls_avalon_slave_memory_argument(6    *sizeof(float)) float *c1_bias,
    hls_avalon_slave_memory_argument(2400 *sizeof(float)) float *c3_weight,
    hls_avalon_slave_memory_argument(16   *sizeof(float)) float *c3_bias,
    hls_avalon_slave_memory_argument(24000*sizeof(float)) float *c5_weight,
    hls_avalon_slave_memory_argument(60   *sizeof(float)) float *c5_bias,
    hls_avalon_slave_memory_argument(960  *sizeof(float)) float *f6_weight,
    hls_avalon_slave_memory_argument(16   *sizeof(float)) float *f6_bias,
    hls_avalon_slave_memory_argument(160  *sizeof(float)) float *f7_weight,
    hls_avalon_slave_memory_argument(10   *sizeof(float)) float *f7_bias
    )
{
    // The output of each layer
    float C1_value[4704]; // 28x28x6
    float A2_value[1176]; // 14x14x6
    float C3_value[1600]; // 10x10x16
    float A4_value[400];  // 5x5x16
    float F5_value[60];
    float F6_value[16];
    float F7_value[10];
    int i, ret = 0;
    // F7 outputs are raw logits and may all be negative, so start the
    // argmax from a very small value rather than 0.
    float temp = -1.0e30f;
    // run the layers in sequence
    ConvLayer_1(img, C1_value, c1_weight, c1_bias);
    AvgpoolLayer_2(C1_value, A2_value);
    ConvLayer_3(A2_value, C3_value, c3_weight, c3_bias);
    AvgpoolLayer_4(C3_value, A4_value);
    FullyConnLayer_5(A4_value, F5_value, c5_weight, c5_bias);
    FullyConnLayer_6(F5_value, F6_value, f6_weight, f6_bias);
    FullyConnLayer_7(F6_value, F7_value, f7_weight, f7_bias);
    // argmax over the 10 class scores
    for (i = 0; i < 10; i++) {
        if (F7_value[i] > temp) {
            temp = F7_value[i];
            ret = i;
        }
    }
    return ret;
}
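Before synthesis, the whole chain can be checked on a PC by compiling the layer functions together with a plain C testbench. The sketch below assumes the headers generated by the Python export step (e.g. input_7.h) and replaces the component wrapper with an ordinary main:

/* Software-only testbench sketch: run the layer chain on one exported image.
   Assumes the generated headers and the layer functions above. */
#include <stdio.h>
#include "layer1_0_weight.h"
#include "layer1_0_bias.h"
#include "layer2_0_weight.h"
#include "layer2_0_bias.h"
#include "fc_weight.h"
#include "fc_bias.h"
#include "fc1_weight.h"
#include "fc1_bias.h"
#include "fc2_weight.h"
#include "fc2_bias.h"
#include "input_7.h"

int main(void) {
    float C1[4704], A2[1176], C3[1600], A4[400], F5[60], F6[16], F7[10];
    ConvLayer_1(input_7, C1, layer1_0_weight, layer1_0_bias);
    AvgpoolLayer_2(C1, A2);
    ConvLayer_3(A2, C3, layer2_0_weight, layer2_0_bias);
    AvgpoolLayer_4(C3, A4);
    FullyConnLayer_5(A4, F5, fc_weight, fc_bias);
    FullyConnLayer_6(F5, F6, fc1_weight, fc1_bias);
    FullyConnLayer_7(F6, F7, fc2_weight, fc2_bias);
    int best = 0;
    for (int i = 1; i < 10; i++)
        if (F7[i] > F7[best]) best = i;
    printf("predicted digit: %d (expected 7)\n", best);
    return 0;
}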
HPS-side code
/*
 * main.c
 *
 *  Created on: Jul. 24, 2022
 *      Author: 86130
 */
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <string.h>
#include <time.h>
#include <stdlib.h>
#define soc_cv_av
#include "hwlib.h"
#include "socal/socal.h"
#include "socal/hps.h"
#include "hps_0.h"
#include "layer1_0_weight.h"
#include "layer1_0_bias.h"
#include "layer2_0_weight.h"
#include "layer2_0_bias.h"
#include "fc_weight.h"
#include "fc_bias.h"
#include "fc1_weight.h"
#include "fc2_weight.h"
#include "fc1_bias.h"
#include "fc2_bias.h"
#include "input_0.h"
#include "input_1.h"
#include "input_2.h"
#include "input_3.h"
#include "input_4.h"
#include "input_5.h"
#include "input_6.h"
#include "input_7.h"
#include "input_8.h"
#include "input_9.h"
#define HW_REGS_BASE (ALT_STM_OFST)     // base address of the HPS peripheral region
#define HW_REGS_SPAN (0x04000000)       // 64 MB span of the HPS peripheral region
#define HW_REGS_MASK (HW_REGS_SPAN - 1) // address mask for the region
// Component ports, gathered into structs
typedef struct {
    volatile float *img;
    volatile float *c1_w;
    volatile float *c1_b;
    volatile float *c3_w;
    volatile float *c3_b;
    volatile float *c5_w;
    volatile float *c5_b;
    volatile float *f6_w;
    volatile float *f6_b;
    volatile float *f7_w;
    volatile float *f7_b;
} fc_port_def;
fc_port_def fc_port;
// Control/status registers of the HLS component's Avalon slave interface
typedef struct {
    volatile long long busy;
    volatile long long start;
    volatile long long ire_en;  // interrupt enable
    volatile long long done;
    volatile long long result;  // the component's return value
} fc_ctrl_def;
fc_ctrl_def *fc_ctrl;
int fc_init(void *virtual_base) {
    void *fc_ctrl_addr;
    // map the component's control/status registers
    fc_ctrl_addr = virtual_base + ((unsigned long)(ALT_LWFPGASLVS_OFST +
        LENET5_0_LENET5_INTERNAL_INST_AVS_CRA_BASE) & (unsigned long)(HW_REGS_MASK));
    fc_ctrl = (fc_ctrl_def *)fc_ctrl_addr;
    fc_ctrl->start = 0;
    // map the component's memory-mapped argument buffers
    fc_port.img = virtual_base + ((unsigned long)(ALT_LWFPGASLVS_OFST +
        LENET5_0_LENET5_INTERNAL_INST_AVS_IMG_BASE) & (unsigned long)(HW_REGS_MASK));
    fc_port.c1_w = virtual_base + ((unsigned long)(ALT_LWFPGASLVS_OFST +
        LENET5_0_LENET5_INTERNAL_INST_AVS_C1_WEIGHT_BASE) & (unsigned long)(HW_REGS_MASK));
    fc_port.c1_b = virtual_base + ((unsigned long)(ALT_LWFPGASLVS_OFST +
        LENET5_0_LENET5_INTERNAL_INST_AVS_C1_BIAS_BASE) & (unsigned long)(HW_REGS_MASK));
    fc_port.c3_w = virtual_base + ((unsigned long)(ALT_LWFPGASLVS_OFST +
        LENET5_0_LENET5_INTERNAL_INST_AVS_C3_WEIGHT_BASE) & (unsigned long)(HW_REGS_MASK));
    fc_port.c3_b = virtual_base + ((unsigned long)(ALT_LWFPGASLVS_OFST +
        LENET5_0_LENET5_INTERNAL_INST_AVS_C3_BIAS_BASE) & (unsigned long)(HW_REGS_MASK));
    fc_port.c5_w = virtual_base + ((unsigned long)(ALT_LWFPGASLVS_OFST +
        LENET5_0_LENET5_INTERNAL_INST_AVS_C5_WEIGHT_BASE) & (unsigned long)(HW_REGS_MASK));
    fc_port.c5_b = virtual_base + ((unsigned long)(ALT_LWFPGASLVS_OFST +
        LENET5_0_LENET5_INTERNAL_INST_AVS_C5_BIAS_BASE) & (unsigned long)(HW_REGS_MASK));
    fc_port.f6_w = virtual_base + ((unsigned long)(ALT_LWFPGASLVS_OFST +
        LENET5_0_LENET5_INTERNAL_INST_AVS_F6_WEIGHT_BASE) & (unsigned long)(HW_REGS_MASK));
    fc_port.f6_b = virtual_base + ((unsigned long)(ALT_LWFPGASLVS_OFST +
        LENET5_0_LENET5_INTERNAL_INST_AVS_F6_BIAS_BASE) & (unsigned long)(HW_REGS_MASK));
    fc_port.f7_w = virtual_base + ((unsigned long)(ALT_LWFPGASLVS_OFST +
        LENET5_0_LENET5_INTERNAL_INST_AVS_F7_WEIGHT_BASE) & (unsigned long)(HW_REGS_MASK));
    fc_port.f7_b = virtual_base + ((unsigned long)(ALT_LWFPGASLVS_OFST +
        LENET5_0_LENET5_INTERNAL_INST_AVS_F7_BIAS_BASE) & (unsigned long)(HW_REGS_MASK));
    // load the trained weights and biases into the component's buffers
    // (the casts drop the volatile qualifier, which memcpy does not accept)
    memcpy((void *)fc_port.c1_w, layer1_0_weight, 150*sizeof(float));
    memcpy((void *)fc_port.c1_b, layer1_0_bias, 6*sizeof(float));
    memcpy((void *)fc_port.c3_w, layer2_0_weight, 2400*sizeof(float));
    memcpy((void *)fc_port.c3_b, layer2_0_bias, 16*sizeof(float));
    memcpy((void *)fc_port.c5_w, fc_weight, 24000*sizeof(float));
    memcpy((void *)fc_port.c5_b, fc_bias, 60*sizeof(float));
    memcpy((void *)fc_port.f6_w, fc1_weight, 960*sizeof(float));
    memcpy((void *)fc_port.f6_b, fc1_bias, 16*sizeof(float));
    memcpy((void *)fc_port.f7_w, fc2_weight, 160*sizeof(float));
    memcpy((void *)fc_port.f7_b, fc2_bias, 10*sizeof(float));
    return 0;
}
const float
*imgx[10]={input_0,input_1,input_2,input_3,input_4,input_5,input_6,input_7,input_8,input_9};
int main() {
    int fd, i;
    void *virtual_base;
    float time_s, time_ns, time_ms;
    struct timespec ts1, ts2;
    // 1. open the physical-memory device with open()
    fd = open("/dev/mem", (O_RDWR | O_SYNC));
    if (fd == (-1)) {
        printf("ERROR: could not open \"/dev/mem\"...\n");
        return 1;
    }
    // 2. map the peripheral address span into user space with mmap()
    virtual_base = mmap(NULL, HW_REGS_SPAN, (PROT_READ | PROT_WRITE), MAP_SHARED, fd, HW_REGS_BASE);
    if (virtual_base == MAP_FAILED) {
        printf("ERROR: mmap() failed...\n");
        close(fd);
        return 1;
    }
    // 3. initialization (a user-defined function)
    fc_init(virtual_base);
    // 4. drive the peripheral
    for (i = 0; i < 10; i++) {
        memcpy((void *)fc_port.img, imgx[i], 1024*sizeof(float));
        clock_gettime(CLOCK_MONOTONIC, &ts1); // record the start time
        fc_ctrl->start = 1;                   // start inference
        while ((fc_ctrl->done & 0x02) == 0);  // busy-wait until the done flag is set
        printf("done:%lld\n", fc_ctrl->done);
        fc_ctrl->start = 0;                   // inference finished, deassert start
        clock_gettime(CLOCK_MONOTONIC, &ts2); // record the end time
        // total time = seconds part + nanoseconds part,
        // converted to milliseconds for display
        time_s = ts2.tv_sec - ts1.tv_sec;
        time_ns = ts2.tv_nsec - ts1.tv_nsec;
        time_ms = time_s*1000 + time_ns/1000000;
        printf("predict time:%.6f ms\n", time_ms);
        printf("input:%d, predict result:%lld\n", i, fc_ctrl->result);
    }
    // 5. unmap with munmap()
    if (munmap(virtual_base, HW_REGS_SPAN) != 0) {
        printf("ERROR: munmap() failed...\n");
        close(fd);
        return 1;
    }
    // 6. close the file descriptor with close()
    close(fd);
    return 0;
}