自定义卷积神经网络-CSDN博客

本文链接：https://blog.csdn.net/weixin_43828944/article/details/126732957

软件端代码

import torch
import torch.nn as nn
from torchvision.transforms import transforms
from torchsummary import summary
from torch.utils.data import DataLoader
import torchvision
import matplotlib.pyplot as plt
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Model hyper-parameters.
batch_size = 64
learing_rate = 0.001  # (sic) spelling kept: the optimizer setup refers to this name
num_epochs = 10
num_classes = 10

# Both splits get the same preprocessing: force 28x28 and convert to a
# float tensor in channel-first layout.
mnist_transform = transforms.Compose([
    transforms.Resize((28, 28)),
    transforms.ToTensor(),
])

# Download (when not already cached under ./data) and load MNIST.
train_dataset = torchvision.datasets.MNIST(root='./data',
                                           train=True,
                                           download=True,
                                           transform=mnist_transform)
test_dataset = torchvision.datasets.MNIST(root='./data',
                                          train=False,
                                          download=True,
                                          transform=mnist_transform)

train_loader = DataLoader(dataset=train_dataset,
                          batch_size=batch_size,
                          shuffle=True)
# The test split is read in its stored order.  Accuracy does not depend on
# the order, and a fixed order makes the first test batch -- which the export
# code further down writes out as input_<label>.h headers -- identical from
# run to run instead of a random draw.
test_loader = DataLoader(dataset=test_dataset,
                         batch_size=batch_size,
                         shuffle=False)
#自定义模型                         
class CustomConv(nn.Module):
    """Minimal CNN for 1x28x28 inputs: one 3x3 convolution, ReLU, one linear layer.

    Args:
        num_classes: Number of output scores produced by the linear layer.
    """

    def __init__(self, num_classes):
        super(CustomConv, self).__init__()
        # 1 input channel -> 6 feature maps; a 3x3 kernel without padding
        # turns a 28x28 image into 26x26 maps.
        self.conv = nn.Conv2d(1, 6, 3)
        # 6 * 26 * 26 = 4056 flattened features.  The output width follows
        # num_classes; it used to be hard-coded to 10, silently ignoring the
        # constructor argument.
        self.fc = nn.Linear(4056, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return raw class scores of shape (batch, num_classes).

        Args:
            x: Input batch of shape (batch, 1, 28, 28).
        """
        x = self.conv(x)
        x = self.relu(x)
        x = x.view(x.size(0), -1)  # flatten everything except the batch axis
        x = self.fc(x)
        return x
        
# Build the network and move its parameters to the same device the training
# and test loops send every batch to; leaving the model on the CPU makes the
# first forward pass fail on a CUDA machine.
model = CustomConv(num_classes).to(device)
summary(model, input_size=(1, 28, 28))

# Cross-entropy loss on the raw scores, optimised with Adam.
cost = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learing_rate)

# Number of batches per epoch, used only for progress output.
total_step = len(train_loader)
# Train the model.
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        images, labels = images.to(device), labels.to(device)

        # Drop the gradients of the previous step, then forward, backward, update.
        optimizer.zero_grad()
        outputs = model(images)
        loss = cost(outputs, labels)
        loss.backward()
        optimizer.step()

        # Progress is reported on every 100th batch only.
        if i % 100 != 0:
            continue
        print("Epoch[{}/{}] Step[{}/{}] Loss[{:.4f}]".format(epoch + 1, num_epochs, i + 1, total_step, loss.item()))
# Evaluate the model on the test set.
model.eval()  # switch any mode-dependent layers to inference behaviour
with torch.no_grad():  # no gradients are needed while scoring
    correct = 0
    total = 0
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        # Index of the highest score along the class axis is the prediction.
        _, predicted = torch.max(outputs, dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    print("Accuracy of model on {} test images is {}%".format(total, correct / total * 100))
# Take one batch from the test loader and convert it to NumPy arrays.
images, labels = next(iter(test_loader))
images = images.numpy()
labels = labels.numpy()

# Draw the first nine samples on a 3x3 grid, each titled with its label.
i = 0
for i, (img, lab) in enumerate(zip(images[:9], labels[:9]), start=1):
    plt.subplot(3, 3, i)
    plt.title('label: ' + str(lab))
    # ToTensor produced channel-first arrays; move axis 0 to the end for imshow.
    plt.imshow(img.transpose(1, 2, 0))
# Persist only the learned parameters.  Pickling the whole module ties the
# file to this exact class definition, and recent PyTorch releases refuse to
# unpickle arbitrary objects through a plain torch.load() call.
torch.save(model.state_dict(), "model.pth")

# Rebuild the network on the CPU and restore the saved weights into it.
model_load = CustomConv(num_classes)
model_load.load_state_dict(torch.load("model.pth", map_location="cpu"))

# Parameter name -> tensor, e.g. "conv.weight", "fc.bias".
param = dict(model_load.state_dict())
# Export every parameter tensor as a C header holding one flat float array.
for key, tensor in param.items():
    c_name = key.replace(".", "_")  # "conv.weight" -> "conv_weight"
    # Flatten in row-major order; this is the same element sequence as the
    # nested-list text with its brackets stripped.
    values = tensor.detach().cpu().reshape(-1).tolist()
    body = ", ".join(repr(v) for v in values)
    with open(c_name + ".h", "w") as f:
        # The declared length is the element count, whatever the tensor's
        # rank (the per-rank branches only covered ranks 1, 2 and 4).
        f.write("float " + c_name + "[" + str(len(values)) + "]" + " = {" + body + "};\n\n")
    print(key, "save successfully!")
# Export one sample image per label as a C header holding a flat float array.
exported_labels = set()
for img, lab in zip(images, labels):
    label = int(lab)
    # Headers are named after the label, so every further sample of an
    # already exported digit would just overwrite the same file; keep the
    # first sample of each digit and skip the rest.
    if label in exported_labels:
        continue
    exported_labels.add(label)
    c_name = "input_" + str(label)
    pixels = ", ".join(repr(v) for v in img.reshape(-1).tolist())
    with open(c_name + ".h", "w") as f:
        # img.size is the total number of values in the sample.
        f.write("float " + c_name + "[" + str(img.size) + "]" + " = {" + pixels + "};\n\n")

HLS端代码

#include "HLS/hls.h"
#include "HLS/stdio.h"
#include "conv_weight.h"
#include "conv_bias.h"
#include "fc_weight.h"
#include "fc_bias.h"
#include "input_0.h"
#include "input_1.h"
#include "input_2.h"
#include "input_3.h"
#include "input_4.h"
#include "input_5.h"
#include "input_6.h"
#include "input_7.h"
#include "input_8.h"
#include "input_9.h"
// Dot product of two flattened 3x3 matrices (9 values each, row-major).
// The products are accumulated in index order 0..8.
float conv3x3(float input[9], float kernel[9]){
    float acc = 0.0;
    for(int idx = 0; idx < 9; idx++){
        acc += input[idx] * kernel[idx];
    }
    return acc;
}
// ReLU activation: positive values pass through, everything else becomes 0.
float Relu(float x){
    if(x > 0.0){
        return x;
    }
    return 0.0;
}
// Convolution layer: six 3x3 kernels slid over a 28x28 image (stride 1, no
// padding), each result followed by a bias add and ReLU.
//   img     - 784 input values, row-major 28x28
//   weight  - 6 kernels of 9 coefficients; kernel k starts at weight[k * 9]
//   bias    - one bias per kernel
//   C_value - output, 6 maps of 26x26 values; map k starts at C_value[k * 676]
void Conv_layer(float img[784], float *weight, float *bias, float *C_value){
    for(int k = 0; k < 6; k++){
        // Local copy of the coefficients of the current kernel.
        float kernel[9];
        for(int c = 0; c < 9; c++){
            kernel[c] = weight[k * 9 + c];
        }
        for(int row = 0; row < 26; row++){
            for(int col = 0; col < 26; col++){
                // Gather the 3x3 window whose top-left pixel is (row, col).
                float window[9];
                int top_left = row * 28 + col;
                for(int dy = 0; dy < 3; dy++){
                    for(int dx = 0; dx < 3; dx++){
                        window[dy * 3 + dx] = img[top_left + dy * 28 + dx];
                    }
                }
                // Weighted sum plus bias, then activation.
                float z = conv3x3(kernel, window) + bias[k];
                C_value[k * 676 + row * 26 + col] = Relu(z);
            }
        }
    }
}
// Fully connected layer: 4056 inputs -> 10 outputs, no activation.
//   input   - 4056 input values
//   weight  - 10 rows of 4056 coefficients; row i starts at weight[i * 4056]
//   bias    - 10 biases
//   F_value - 10 outputs; F_value[i] = sum_j input[j] * weight[i * 4056 + j] + bias[i]
void FullConnect(float input[4056], float *weight, float *bias, float *F_value){
    for(int out = 0; out < 10; out++){
        const float *row = weight + out * 4056;
        float acc = 0.0;
        for(int j = 0; j < 4056; j++){
            acc += input[j] * row[j];
        }
        // The bias is added once the whole row has been accumulated.
        F_value[out] = acc + bias[out];
    }
}
// Top-level HLS component: runs the convolution layer and the fully
// connected layer on one 28x28 image and returns the index (0-9) of the
// largest output score.
//   img         - 784 input values
//   conv_weight - 54 convolution coefficients (6 kernels x 9)
//   conv_bias   - 6 convolution biases
//   fc_weight   - 40560 fully connected coefficients (10 x 4056)
//   fc_bias     - 10 fully connected biases
hls_avalon_slave_component
component int CustomConv(
    hls_avalon_slave_memory_argument(784 * sizeof(float)) float *img,
    hls_avalon_slave_memory_argument(54 * sizeof(float)) float *conv_weight,
    hls_avalon_slave_memory_argument(6 * sizeof(float)) float *conv_bias,
    hls_avalon_slave_memory_argument(40560 * sizeof(float)) float *fc_weight,
    hls_avalon_slave_memory_argument(10 * sizeof(float)) float *fc_bias
    ){
    float C_value[4056];
    float F_value[10];
    Conv_layer(img, conv_weight, conv_bias, C_value);
    FullConnect(C_value, fc_weight, fc_bias, F_value);
    // Argmax seeded with class 0.  The scores are raw (no activation after
    // the last layer), so all of them may be <= 0; comparing against an
    // initial 0.0 would then never assign the result and return an
    // indeterminate value.
    int best = 0;
    float best_score = F_value[0];
    for(int i = 1; i < 10; i++){
        if (F_value[i] > best_score){
            best_score = F_value[i];
            best = i;
        }
    }
    return best;
}

// Testbench: run the component on the ten sample images input_0 .. input_9
// and print the class it returns for each.
int main(){
    float *samples[10] = {
        input_0, input_1, input_2, input_3, input_4,
        input_5, input_6, input_7, input_8, input_9
    };
    for(int n = 0; n < 10; n++){
        int predicted = CustomConv(samples[n], conv_weight, conv_bias, fc_weight, fc_bias);
        printf("The result is: %d\n", predicted);
    }
}

HPS端代码

#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <string.h>
#include <time.h>
#include <stdlib.h>
#define soc_cv_av
#include "hwlib.h"
#include "socal/socal.h"
#include "socal/hps.h"
#include "hps_0.h"
#include "layer1_0_weight.h"
#include "layer1_0_bias.h"
#include "fc_weight.h"
#include "fc_bias.h"
#include "fc1_weight.h"
#include "fc2_weight.h"
#include "input_0.h"
#include "input_1.h"
#include "input_2.h"
#include "input_3.h"
#include "input_4.h"
#include "input_5.h"
#include "input_6.h"
#include "input_7.h"
#include "input_8.h"
#include "input_9.h"
#define HW_REGS_BASE (ALT_STM_OFST) // base address of the HPS peripheral address region
#define HW_REGS_SPAN (0x04000000) // size of the HPS peripheral address region: 64 MB
#define HW_REGS_MASK (HW_REGS_SPAN - 1) // mask that keeps an address within the HPS peripheral region
// Interface definitions (as structs).

// Pointers to the accelerator's memory-mapped data buffers; fc_init() fills
// them in from the *_BASE offsets named below.
typedef struct{
	volatile float *img;	// image buffer (AVS_IMG_BASE); main() copies 784 floats here per run
	volatile float *c1_w;	// AVS_C1_WEIGHT_BASE; fc_init() copies 54 floats here
	volatile float *c1_b;	// AVS_C1_BIAS_BASE; fc_init() copies 6 floats here
	volatile float *f2_w;	// AVS_F6_WEIGHT_BASE; fc_init() copies 40560 floats here
	volatile float *f2_b;	// AVS_F6_BIAS_BASE; fc_init() copies 10 floats here
}fc_port_def;
fc_port_def fc_port;

// Control/status block that fc_init() maps at the AVS_CRA_BASE offset: five
// consecutive long long fields.
// NOTE(review): the field order presumably mirrors the generated component's
// register map - confirm against the generated register header.
typedef struct{
	volatile long long busy;	// not accessed in this file
	volatile long long start;	// main() writes 1 to start a run and 0 afterwards
	volatile long long ire_en;	// not accessed in this file; presumably interrupt enable - confirm
	volatile long long done;	// main() polls bit 1 (mask 0x02) to detect completion
	volatile long long result;	// main() reads the prediction from here
}fc_ctrl_def;
fc_ctrl_def *fc_ctrl;

int fc_init(void *virtual_base){
	void *fc_ctrl_addr;
	fc_ctrl_addr = virtual_base + ((unsigned long)(ALT_LWFPGASLVS_OFST +
			LENET5_0_LENET5_INTERNAL_INST_AVS_CRA_BASE) & (unsigned long)(HW_REGS_MASK));
	fc_ctrl = (fc_ctrl_def*)fc_ctrl_addr; //接口映射
	fc_ctrl->start = 0;
	fc_port.img = virtual_base + ((unsigned long)(ALT_LWFPGASLVS_OFST +
			LENET5_0_LENET5_INTERNAL_INST_AVS_IMG_BASE) & (unsigned long)(HW_REGS_MASK));
	fc_port.c1_w = virtual_base + ((unsigned long)(ALT_LWFPGASLVS_OFST +
			LENET5_0_LENET5_INTERNAL_INST_AVS_C1_WEIGHT_BASE) & (unsigned long)(HW_REGS_MASK));
	fc_port.c1_b = virtual_base + ((unsigned long)(ALT_LWFPGASLVS_OFST +
			LENET5_0_LENET5_INTERNAL_INST_AVS_C1_BIAS_BASE) & (unsigned long)(HW_REGS_MASK));
	fc_port.f2_w = virtual_base + ((unsigned long)(ALT_LWFPGASLVS_OFST +
			LENET5_0_LENET5_INTERNAL_INST_AVS_F6_WEIGHT_BASE) & (unsigned long)(HW_REGS_MASK));
	fc_port.f2_b = virtual_base + ((unsigned long)(ALT_LWFPGASLVS_OFST +
			LENET5_0_LENET5_INTERNAL_INST_AVS_F6_BIAS_BASE) & (unsigned long)(HW_REGS_MASK));
//加载权重参数、偏置参数
	memcpy(fc_port.c1_w,layer1_0_weight,54*sizeof(float));
	memcpy(fc_port.c1_b,layer1_0_bias,6*sizeof(float));
	memcpy(fc_port.f2_w,fc1_weight,40560*sizeof(float));
	memcpy(fc_port.f2_b,fc1_bias,10*sizeof(float));
	return 0;
}
// The ten sample images, in the order input_0 .. input_9.
const float *imgx[10] = {
	input_0, input_1, input_2, input_3, input_4,
	input_5, input_6, input_7, input_8, input_9
};
// Map the HPS peripheral window, initialise the accelerator, run the ten
// sample images through it while timing each run, then release the mapping.
// Returns 0 on success and 1 when /dev/mem cannot be opened, mapped or
// unmapped.
int main(){
	int fd,i;
	void *virtual_base;
	// double keeps the nanosecond difference exact before it is scaled.
	double time_s,time_ns,time_ms;
	struct timespec ts1,ts2;
//1. Open the physical-memory device with open()
	fd = open("/dev/mem",(O_RDWR | O_SYNC));
	if(fd == (-1)){
		printf("ERROR:could not open\"/dev/mem\"...\n");
		return 1;
	}
//2. Map the peripheral address space into user space with mmap()
	virtual_base = mmap(NULL,HW_REGS_SPAN,( PROT_READ |
			PROT_WRITE ),MAP_SHARED,fd,HW_REGS_BASE);
	// mmap() reports failure with MAP_FAILED; using that value as a base
	// address would crash on the first register access.
	if(virtual_base == MAP_FAILED){
		printf("ERROR:mmap()failed...\n");
		close(fd);
		return 1;
	}
//3. Initialise the accelerator interface
	fc_init(virtual_base);
//4. Drive the peripheral: one run per sample image
	for(i=0;i<10;i++)
	{
		memcpy((void *)fc_port.img,imgx[i],784*sizeof(float));
		clock_gettime(CLOCK_MONOTONIC,&ts1); // start of the timed run
		fc_ctrl->start = 1; // start the run
		while((fc_ctrl->done & 0x02) == 0); // busy-wait until bit 1 of done is set
		// Stop the clock as soon as completion is seen, so the console
		// output below is not counted as prediction time.
		clock_gettime(CLOCK_MONOTONIC,&ts2);
		// The registers are long long; convert explicitly for %d.
		printf("%d",(int)fc_ctrl->done);
		fc_ctrl->start = 0; // run finished, clear the start flag
		// Total time = seconds part + nanoseconds part, shown in milliseconds.
		time_s = ts2.tv_sec - ts1.tv_sec;
		time_ns = ts2.tv_nsec - ts1.tv_nsec;
		time_ms = time_s*1000 + time_ns/1000000;
		printf("predict time:%.6f ms\n",time_ms);
		printf("input:%d,predict result:%d\n",i,(int)fc_ctrl->result);
	}
//5. Remove the mapping with munmap()
	if(munmap(virtual_base,HW_REGS_SPAN)!=0){
		printf("ERROR:munmap()failed...\n");
		close(fd);
		return 1;
	}
//6. Close the file descriptor with close()
	close(fd);
	return 0;
}