- 软件端代码
import torch
import torch.nn as nn
from torchvision.transforms import transforms
from torchsummary import summary
from torch.utils.data import DataLoader
import torchvision
import matplotlib.pyplot as plt
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Training hyper-parameters.
batch_size = 64
learing_rate = 0.001  # (sic) spelling kept; the optimizer setup reads this name
num_epochs = 10
num_classes = 10

# MNIST train and test splits, downloaded into ./data on first use.  Each
# split gets its own transform pipeline: resize to 28x28, then convert the
# image to a tensor.
train_dataset, test_dataset = (
    torchvision.datasets.MNIST(
        root='./data',
        train=is_train,
        download=True,
        transform=transforms.Compose([
            transforms.Resize((28, 28)),
            transforms.ToTensor(),
        ]),
    )
    for is_train in (True, False)
)

# Mini-batch iterators; both splits are reshuffled on every pass.
train_loader, test_loader = (
    DataLoader(dataset=split, batch_size=batch_size, shuffle=True)
    for split in (train_dataset, test_dataset)
)
class CustomConv(nn.Module):
    """Small CNN: one 3x3 convolution, ReLU, then a linear classifier.

    The linear layer expects 4056 input features, which is what the
    convolution produces for a 1x28x28 input (6 maps of 26x26).

    Args:
        num_classes: Number of output scores produced by the classifier.
    """

    def __init__(self, num_classes):
        super(CustomConv, self).__init__()
        # 1 input channel -> 6 feature maps, 3x3 kernel, default stride/padding.
        self.conv = nn.Conv2d(1, 6, 3)
        # Honour the requested class count instead of a hard-coded 10.
        self.fc = nn.Linear(6 * 26 * 26, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return one row of ``num_classes`` raw scores per input sample."""
        x = self.conv(x)
        x = self.relu(x)
        # Flatten everything except the batch dimension for the linear layer.
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x
# Build the network and move its parameters to the selected device.  The
# training and evaluation loops move every batch to `device`, so a model left
# on the CPU would fail with a device mismatch whenever CUDA is available.
model = CustomConv(num_classes).to(device)

# Print a per-layer summary for a single 1x28x28 input.
summary(model, input_size=(1, 28, 28))

# Cross-entropy loss on the raw scores, optimised with Adam.
cost = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learing_rate)
# Train for num_epochs passes over the training loader.
total_step = len(train_loader)
for epoch in range(num_epochs):
    for step, (inputs, targets) in enumerate(train_loader):
        inputs, targets = inputs.to(device), targets.to(device)

        # Forward pass and loss for this mini-batch.
        loss = cost(model(inputs), targets)

        # Clear old gradients, back-propagate, then update the weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Report progress on every 100th mini-batch (steps 1, 101, 201, ...).
        if step % 100 != 0:
            continue
        print("Epoch[{}/{}] Step[{}/{}] Loss[{:.4f}]".format(
            epoch + 1, num_epochs, step + 1, total_step, loss.item()))
# Measure classification accuracy on the test loader; gradients are not
# needed here, so the whole pass runs under no_grad.
correct = 0
total = 0
with torch.no_grad():
    for inputs, targets in test_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        # Index of the highest score in each row is the predicted class.
        predicted = torch.max(model(inputs), dim=1).indices
        total += targets.shape[0]
        correct += torch.eq(predicted, targets).sum().item()
print("Accurcy of model on {} test images is {}%".format(total, correct/total * 100))
# Take one batch from the test loader and convert it to NumPy arrays; the
# image-export step further down reuses `images` and `labels`.
images, labels = next(iter(test_loader))
images = images.numpy()
labels = labels.numpy()

# Draw the first nine samples in a 3x3 grid, each titled with its label.
for i, (img, lab) in enumerate(zip(images[:9], labels[:9]), start=1):
    plt.subplot(3, 3, i)
    plt.title('label: ' + str(lab))
    # Move the channel axis last before handing the image to imshow.
    plt.imshow(img.transpose(1, 2, 0))
# Persist the trained model as a whole-module pickle, as before.
torch.save(model, "model.pth")

# Export every entry of the state dict as a flat C float array, one header
# per entry (for example "conv.weight" -> conv_weight.h, array conv_weight).
# The tensors are read straight from the in-memory model: reloading the
# whole-module pickle with torch.load() added nothing and is rejected by
# PyTorch releases whose torch.load defaults to weights_only=True.
param = dict(model.state_dict())
for key, tensor in param.items():
    c_name = key.replace(".", "_")
    # Row-major flattening gives the same element order as stripping the
    # brackets from the nested-list text, and works for any tensor rank.
    values = ", ".join(repr(v) for v in tensor.flatten().tolist())
    with open(c_name + ".h", "w") as f:
        # numel() is the true element count whatever the rank, so the
        # declared array length always matches the initialiser.
        f.write("float " + c_name + "[" + str(tensor.numel()) + "]"
                + " = {" + values + "};\n\n")
    print(key, "save successfully!")
# Export sample images from the previewed batch as C headers, one per digit
# label: input_<label>.h holding a flat float array named input_<label>.
# When a label occurs several times the last occurrence wins, which is the
# file content the earlier write-every-image loop ended up with, but each
# header is now written only once.
samples = {}
for img, lab in zip(images, labels):
    samples[int(lab)] = img

for digit, img in samples.items():
    c_name = "input_" + str(digit)
    pixels = ", ".join(repr(v) for v in img.flatten().tolist())
    with open(c_name + ".h", "w") as f:
        # img.size counts every element, so the declared length matches the
        # initialiser even if the image has more than one channel.
        f.write("float " + c_name + "[" + str(img.size) + "]"
                + " = {" + pixels + "};\n\n")

# The batch is drawn at random, so a digit can be absent.  Say so loudly,
# because the C sources include input_0.h ... input_9.h unconditionally.
missing = [d for d in range(num_classes) if d not in samples]
if missing:
    print("Warning: no sample in this batch for digit(s) {}; "
          "the matching input_<digit>.h files were not written".format(missing))
- HLS端代码
#include "HLS/hls.h"
#include "HLS/stdio.h"
#include "conv_weight.h"
#include "conv_bias.h"
#include "fc_weight.h"
#include "fc_bias.h"
#include "input_0.h"
#include "input_1.h"
#include "input_2.h"
#include "input_3.h"
#include "input_4.h"
#include "input_5.h"
#include "input_6.h"
#include "input_7.h"
#include "input_8.h"
#include "input_9.h"
// Multiply two nine-element arrays element by element and return the sum
// of the products, accumulated in index order 0..8.
float conv3x3(float input[9], float kernel[9]){
    float acc = 0.0;
    for(int idx = 0; idx < 9; idx++){
        acc += input[idx] * kernel[idx];
    }
    return acc;
}
// Rectified linear unit: returns x when x is greater than zero, else zero.
// Float literals keep the comparison and the select in single precision;
// the unsuffixed 0.0 literals promoted both to double for no benefit.
float Relu(float x){
    return x > 0.0f ? x : 0.0f;
}
// Unpadded 3x3 convolution of one 28x28 image with six kernels, each output
// passed through Relu.
//   img     : 784 floats, indexed as column + row * 28
//   weight  : 54 floats, nine coefficients per kernel, kernels back to back
//   bias    : one bias per kernel (6 floats)
//   C_value : output, 6 * 26 * 26 = 4056 floats, indexed as
//             column + row * 26 + kernel * 676
void Conv_layer(float img[784], float *weight, float *bias, float *C_value){
    for(int ch = 0; ch < 6; ch++){
        // Local copy of the nine coefficients of kernel `ch`.
        float kernel[9];
        for(int t = 0; t < 9; t++){
            kernel[t] = weight[ch * 9 + t];
        }
        for(int row = 0; row < 26; row++){
            for(int col = 0; col < 26; col++){
                // Gather the 3x3 window whose top-left pixel is (row, col).
                float window[9];
                for(int dr = 0; dr < 3; dr++){
                    for(int dc = 0; dc < 3; dc++){
                        window[dr * 3 + dc] = img[(row + dr) * 28 + (col + dc)];
                    }
                }
                // conv3x3 is called as (kernel, window); it only multiplies
                // the two arrays element-wise, so the order does not matter.
                float z = conv3x3(kernel, window) + bias[ch];
                C_value[ch * 676 + row * 26 + col] = Relu(z);
            }
        }
    }
}
// Fully connected layer: 4056 inputs -> 10 outputs.
//   input   : 4056 activations
//   weight  : 10 rows of 4056 coefficients, row i starting at i * 4056
//   bias    : 10 biases, added after the dot product
//   F_value : receives the 10 raw output scores (no activation applied)
void FullConnect(float input[4056], float *weight, float *bias, float *F_value){
    for(int out = 0; out < 10; out++){
        int row_base = out * 4056;
        float acc = 0.0;
        for(int k = 0; k < 4056; k++){
            acc += input[k] * weight[row_base + k];
        }
        acc += bias[out];
        F_value[out] = acc;
    }
}
// Top-level HLS component: runs Conv_layer and FullConnect on one image and
// returns the index of the highest of the ten output scores.
// All five arguments are Avalon slave memories sized for a 784-float image,
// 54 convolution weights, 6 convolution biases, 40560 fully-connected
// weights and 10 fully-connected biases.
hls_avalon_slave_component
component int CustomConv(
    hls_avalon_slave_memory_argument(784 * sizeof(float)) float *img,
    hls_avalon_slave_memory_argument(54 * sizeof(float)) float *conv_weight,
    hls_avalon_slave_memory_argument(6 * sizeof(float)) float *conv_bias,
    hls_avalon_slave_memory_argument(40560 * sizeof(float)) float *fc_weight,
    hls_avalon_slave_memory_argument(10 * sizeof(float)) float *fc_bias
){
    float C_value[4056];
    float F_value[10];
    Conv_layer(img, conv_weight, conv_bias, C_value);
    FullConnect(C_value, fc_weight, fc_bias, F_value);

    // Arg-max over the scores.  The search is seeded with class 0 so the
    // result is always defined; seeding the running maximum with 0.0 left
    // the return value uninitialised whenever no score was positive.
    // Strict '>' keeps the first index when several scores tie.
    int ret = 0;
    float best = F_value[0];
    for(int i = 1; i < 10; i++){
        if(F_value[i] > best){
            best = F_value[i];
            ret = i;
        }
    }
    return ret;
}
// Testbench: feeds the ten exported sample images to the component one by
// one and prints the class index it returns for each.
int main(){
    float *img[10] = {
        input_0, input_1, input_2, input_3, input_4,
        input_5, input_6, input_7, input_8, input_9
    };
    for(int i = 0; i < 10; i++){
        int ret = CustomConv(img[i], conv_weight, conv_bias, fc_weight, fc_bias);
        printf("The result is: %d\n", ret);
    }
    return 0;
}
- HPS端代码
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <string.h>
#include <time.h>
#include <stdlib.h>
#define soc_cv_av
#include "hwlib.h"
#include "socal/socal.h"
#include "socal/hps.h"
#include "hps_0.h"
#include "layer1_0_weight.h"
#include "layer1_0_bias.h"
#include "fc_weight.h"
#include "fc_bias.h"
#include "fc1_weight.h"
#include "fc2_weight.h"
#include "input_0.h"
#include "input_1.h"
#include "input_2.h"
#include "input_3.h"
#include "input_4.h"
#include "input_5.h"
#include "input_6.h"
#include "input_7.h"
#include "input_8.h"
#include "input_9.h"
/* Offset into /dev/mem at which main() maps the register window. */
#define HW_REGS_BASE (ALT_STM_OFST)
/* Length in bytes of that mapping (0x04000000 = 64 MiB). */
#define HW_REGS_SPAN (0x04000000)
/* Mask fc_init() applies to a bus address to get an offset inside the window. */
#define HW_REGS_MASK (HW_REGS_SPAN - 1)
/* Pointers to the accelerator's argument buffers; every field is set by fc_init(). */
typedef struct{
/* Input image buffer; main() copies 784 floats here before each run. */
volatile float *img;
/* Convolution weights; fc_init() copies 54 floats here. */
volatile float *c1_w;
/* Convolution biases; fc_init() copies 6 floats here. */
volatile float *c1_b;
/* Fully-connected weights; fc_init() copies 40560 floats here. */
volatile float *f2_w;
/* Fully-connected biases; fc_init() copies 10 floats here. */
volatile float *f2_b;
}fc_port_def;
fc_port_def fc_port;
/*
 * Overlay for the component's control registers: five consecutive
 * `volatile long long` fields.  fc_init() points fc_ctrl at the AVS_CRA base.
 * NOTE(review): field meanings below are read off their use in this file
 * and their names; confirm against the generated register map.
 */
typedef struct{
/* Not accessed in this file; presumably a busy flag. */
volatile long long busy;
/* Written 1 by main() to launch a run and 0 afterwards. */
volatile long long start;
/* Not accessed in this file; presumably an interrupt-enable register. */
volatile long long ire_en;
/* main() polls this until the 0x02 bit is set. */
volatile long long done;
/* Read by main() after a run and printed as the prediction. */
volatile long long result;
}fc_ctrl_def;
fc_ctrl_def *fc_ctrl;
int fc_init(void *virtual_base){
void *fc_ctrl_addr;
fc_ctrl_addr = virtual_base + ((unsigned long)(ALT_LWFPGASLVS_OFST +
LENET5_0_LENET5_INTERNAL_INST_AVS_CRA_BASE) & (unsigned long)(HW_REGS_MASK));
fc_ctrl = (fc_ctrl_def*)fc_ctrl_addr;
fc_ctrl->start = 0;
fc_port.img = virtual_base + ((unsigned long)(ALT_LWFPGASLVS_OFST +
LENET5_0_LENET5_INTERNAL_INST_AVS_IMG_BASE) & (unsigned long)(HW_REGS_MASK));
fc_port.c1_w = virtual_base + ((unsigned long)(ALT_LWFPGASLVS_OFST +
LENET5_0_LENET5_INTERNAL_INST_AVS_C1_WEIGHT_BASE) & (unsigned long)(HW_REGS_MASK));
fc_port.c1_b = virtual_base + ((unsigned long)(ALT_LWFPGASLVS_OFST +
LENET5_0_LENET5_INTERNAL_INST_AVS_C1_BIAS_BASE) & (unsigned long)(HW_REGS_MASK));
fc_port.f2_w = virtual_base + ((unsigned long)(ALT_LWFPGASLVS_OFST +
LENET5_0_LENET5_INTERNAL_INST_AVS_F6_WEIGHT_BASE) & (unsigned long)(HW_REGS_MASK));
fc_port.f2_b = virtual_base + ((unsigned long)(ALT_LWFPGASLVS_OFST +
LENET5_0_LENET5_INTERNAL_INST_AVS_F6_BIAS_BASE) & (unsigned long)(HW_REGS_MASK));
memcpy(fc_port.c1_w,layer1_0_weight,54*sizeof(float));
memcpy(fc_port.c1_b,layer1_0_bias,6*sizeof(float));
memcpy(fc_port.f2_w,fc1_weight,40560*sizeof(float));
memcpy(fc_port.f2_b,fc1_bias,10*sizeof(float));
return 0;
}
/* The ten sample images, indexed 0..9 in the order input_0 .. input_9. */
const float *imgx[10] = {
    input_0, input_1, input_2, input_3, input_4,
    input_5, input_6, input_7, input_8, input_9
};
/*
 * Map the register window through /dev/mem, initialise the accelerator,
 * then run the ten sample images through it once, printing the contents of
 * the done register, the elapsed time and the returned prediction for each.
 * Returns 0 on success and 1 if /dev/mem cannot be opened, mapped or unmapped.
 */
int main(){
    int fd, i;
    void *virtual_base;
    float time_s, time_ns, time_ms;
    struct timespec ts1, ts2;

    fd = open("/dev/mem", (O_RDWR | O_SYNC));
    if(fd == (-1)){
        printf("ERROR:could not open\"/dev/mem\"...\n");
        return 1;
    }

    virtual_base = mmap(NULL, HW_REGS_SPAN, (PROT_READ | PROT_WRITE),
                        MAP_SHARED, fd, HW_REGS_BASE);
    /* mmap() reports failure with MAP_FAILED; using that value as a base
     * address would fault on the first register access. */
    if(virtual_base == MAP_FAILED){
        printf("ERROR:mmap()failed...\n");
        close(fd);
        return 1;
    }

    fc_init(virtual_base);

    for(i = 0; i < 10; i++){
        /* Load the next image into the accelerator's input buffer. */
        memcpy((void *)fc_port.img, imgx[i], 784 * sizeof(float));

        clock_gettime(CLOCK_MONOTONIC, &ts1);
        fc_ctrl->start = 1;
        /* Busy-wait until the 0x02 bit of the done register is set. */
        while((fc_ctrl->done & 0x02) == 0){
        }
        /* The registers are long long, so print them with %lld; passing a
         * long long to %d is undefined behaviour. */
        printf("%lld", (long long)fc_ctrl->done);
        fc_ctrl->start = 0;
        clock_gettime(CLOCK_MONOTONIC, &ts2);

        /* Elapsed time in milliseconds; a negative nanosecond difference
         * is compensated by the seconds term. */
        time_s = ts2.tv_sec - ts1.tv_sec;
        time_ns = ts2.tv_nsec - ts1.tv_nsec;
        time_ms = time_s*1000 + time_ns/1000000;
        printf("predict time:%.6f ms\n", time_ms);
        printf("input:%d,predict result:%lld\n", i, (long long)fc_ctrl->result);
    }

    if(munmap(virtual_base, HW_REGS_SPAN) != 0){
        printf("ERROR:munmap()failed...\n");
        close(fd);
        return 1;
    }
    close(fd);
    return 0;
}