Vivado HLS实现MNIST手写数字识别（1）

最新推荐文章于 2024-07-22 06:36:58 发布

基尼台妹

最新推荐文章于 2024-07-22 06:36:58 发布

阅读量1.2k

点赞数 2

分类专栏：ZYNQ实现卷积神经网络　文章标签：人工智能、深度学习、fpga开发、神经网络、c++

本文链接：https://blog.csdn.net/Legendyyy/article/details/132782524

版权

ZYNQ实现卷积神经网络专栏收录该内容

3 篇文章

订阅专栏

本文介绍了如何在TensorFlow中设计卷积神经网络并将其应用于MNIST手写数字识别，随后详细描述了如何利用HLS硬件模块在Xilinx ZYNQ平台上实现该网络：先使用Vivado完成硬件设计和验证，再使用Vitis SDK进行软件开发，实现软硬件协同工作。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

在设计完HLS卷积和池化硬件模块后，这里通过最基础的MNIST手写数字识别案例将两个模块运用起来。

一、TensorFlow设计卷积神经网络

卷积神经网络在图像处理领域有着很大的优势，本案例通过设计包含两个卷积池化层和两个全连接层的神经网络对MNIST手写数字进行训练识别，最后识别的准确率保持在99%。

from tensorflow.examples.tutorials.mnist import input_data
# Load the MNIST data set, using MNIST_data/ as the data directory and one-hot encoded labels
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
import tensorflow as tf
import numpy as np
import struct
#Record the network parameters in a .dat (text) file and a .bin (binary) file;
#the .bin file is the one that is finally placed on the SD card
def Parameter_record(tensor,name):
    """Dump the value of ``tensor`` to ``./record/<name>.dat`` and ``./record/<name>.bin``.

    Args:
        tensor: object whose ``eval()`` returns the array to dump (this script
            passes TensorFlow variables from inside the ``with tf.Session()`` block).
        name: base file name, without extension, used inside ``./record/``.

    Raises:
        ValueError: if the evaluated array is not 1-D, 2-D, 3-D or 4-D.
    """
    import os

    print("Recording "+name+"...")
    #.eval() fetches the current value of the tensor, much like Session.run would
    array = tensor.eval()
    dimensions = np.ndim(array)
    #Reject unsupported ranks before touching the file system, so that no
    #empty or half-written files are left behind
    if dimensions < 1 or dimensions > 4:
        raise ValueError("Parameter_record supports 1-D to 4-D arrays, got %d-D for %s"
                         % (dimensions, name))
    writers = {1: Array1D_record, 2: Array2D_record,
               3: Array3D_record, 4: Array4D_record}
    #Create the output directory on first use instead of failing inside open()
    if not os.path.isdir('./record'):
        os.makedirs('./record')
    #'w' opens the text file, 'wb' opens the file for binary writing; the with
    #statement closes (and therefore flushes) both files even if a writer raises
    with open('./record/'+name+".dat",'w') as f, open('./record/'+name+".bin",'wb') as b:
        writers[dimensions](array,f,b)

def Array1D_record(array,f,b):
    """Write every element of a 1-D array to the text file ``f`` and the binary file ``b``.

    Each value is written to ``f`` as ``str(value)`` on its own line and to
    ``b`` packed with the ``struct`` format ``'f'``.
    """
    for idx in range(np.shape(array)[0]):
        value = array[idx]
        f.write(str(value)+"\n")
        #struct.pack converts the Python float into the binary layout of a C float
        b.write(struct.pack('f', value))

def Array2D_record(array,f,b):
    """Write a 2-D array element by element, first axis outermost.

    Each value is written to the text file ``f`` as ``str(value)`` on its own
    line and to the binary file ``b`` packed with the ``struct`` format ``'f'``.
    """
    for r in range(np.shape(array)[0]):
        row = array[r]
        for c in range(np.shape(array)[1]):
            value = row[c]
            f.write(str(value)+"\n")
            b.write(struct.pack('f', value))

def Array3D_record(array,f,b):
    """Write a 3-D array element by element, first axis outermost, last axis innermost.

    Each value is written to the text file ``f`` as ``str(value)`` on its own
    line and to the binary file ``b`` packed with the ``struct`` format ``'f'``.
    """
    for p in range(np.shape(array)[0]):
        plane = array[p]
        for r in range(np.shape(array)[1]):
            row = plane[r]
            for c in range(np.shape(array)[2]):
                value = row[c]
                f.write(str(value)+"\n")
                b.write(struct.pack('f', value))

def Array4D_record(array,f,b):
    """Write a 4-D array element by element, first axis outermost, last axis innermost.

    Each value is written to the text file ``f`` as ``str(value)`` on its own
    line and to the binary file ``b`` packed with the ``struct`` format ``'f'``.
    """
    for v in range(np.shape(array)[0]):
        volume = array[v]
        for p in range(np.shape(array)[1]):
            plane = volume[p]
            for r in range(np.shape(array)[2]):
                row = plane[r]
                for c in range(np.shape(array)[3]):
                    value = row[c]
                    f.write(str(value)+"\n")
                    b.write(struct.pack('f', value))

# Number of samples in each training batch
batch_size = 50
# Number of whole batches contained in the training set
n_batch = mnist.train.num_examples // batch_size

#Initialise a weight tensor
def weight_variable(shape):
    """Return a ``tf.Variable`` of the given shape filled with truncated-normal noise.

    The values are drawn by ``tf.truncated_normal`` with standard deviation 0.1
    and the function's default mean.
    """
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial)

#Initialise a bias tensor
def bias_variable(shape):
    """Return a ``tf.Variable`` of the given shape with every element set to 0.1."""
    initial = tf.zeros(shape) + 0.1
    return tf.Variable(initial)

#Convolution layer
def conv2d(x,W):
    """Convolve ``x`` with the kernel ``W`` using stride 1 and "SAME" padding.

    Args:
        x: input tensor of shape [batch, in_height, in_width, in_channels].
        W: filter/kernel tensor of shape
            [filter_height, filter_width, in_channels, out_channels].
    """
    #strides holds one step per dimension of x; every step is 1 here
    unit_strides = [1,1,1,1]
    return tf.nn.conv2d(x, W, strides=unit_strides, padding="SAME")

#Pooling layer
def max_pool_2x2(x):
    """Apply max pooling with a 2x2 window, stride 2 and "SAME" padding to ``x``."""
    #ksize = [1, x, y, 1]: the pooling window covers 2x2 positions
    window = [1,2,2,1]
    step = [1,2,2,1]
    return tf.nn.max_pool(x, ksize=window, strides=step, padding="SAME")
# Placeholders holding the input images and their labels
x = tf.placeholder(tf.float32, [None, 784])
y = tf.placeholder(tf.float32, [None, 10])
# Reshape x into a 4-D tensor [batch, in_height, in_width, in_channels]
x_image = tf.reshape(x, [-1, 28, 28, 1])

# First convolution layer: 3x3 window, 12 kernels extracting features from 1 input plane
W_conv1 = weight_variable([3, 3, 1, 12])
# One bias per kernel
b_conv1 = bias_variable([12])
# Convolve x_image with the weights, add the bias, apply ReLU, then max-pool
h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
h_pool1 = max_pool_2x2(h_conv1)  # [batch,14,14,12]

# Second convolution layer: 3x3 window, 24 kernels extracting features from 12 input planes
W_conv2 = weight_variable([3, 3, 12, 24])
b_conv2 = bias_variable([24])

h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
h_pool2 = max_pool_2x2(h_conv2)  # [batch,7,7,24]

# First fully connected layer: the previous layer has 7*7*24 neurons, this layer has 96
fc1_inputs = 7 * 7 * 24
W_fc1 = weight_variable([fc1_inputs, 96])
b_fc1 = bias_variable([96])

# Flatten the pooled feature maps so they can be multiplied by W_fc1
h_pool2_flat = tf.reshape(h_pool2, [-1, fc1_inputs])
# Output of the first fully connected layer
h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)
# keep_prob is the probability that a neuron's output is kept by dropout
keep_prob = tf.placeholder(tf.float32)
h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)

# Second fully connected layer: 96 inputs, 10 outputs
W_fc2 = weight_variable([96, 10])
b_fc2 = bias_variable([10])
# Raw, unnormalised class scores (logits) of the output layer
logits = tf.matmul(h_fc1_drop, W_fc2) + b_fc2
# Class probabilities, used for evaluation only
prediction = tf.nn.softmax(logits)
# Cross-entropy cost function. softmax_cross_entropy_with_logits applies softmax
# internally, so it must be fed the raw logits; feeding it `prediction` would
# apply softmax twice and flatten the gradients.
cross_entroy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=logits))
train_step = tf.train.AdadeltaOptimizer(learning_rate=1.5).minimize(cross_entroy)
# argmax returns the position of the largest value along axis 1, i.e. the class index
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(prediction, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

# tf.initialize_all_variables is deprecated; this is its documented replacement
init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    # Train for 30 epochs and report the test-set accuracy after each one
    for epoch in range(30):
        for batch in range(n_batch):
            x_data, y_data = mnist.train.next_batch(batch_size)
            # Training step: feed one batch with keep_prob set to 0.85
            train_feed = {x: x_data, y: y_data, keep_prob: 0.85}
            sess.run(train_step, feed_dict=train_feed)
        # Evaluation on the whole test set with keep_prob set to 1.0
        test_feed = {x: mnist.test.images, y: mnist.test.labels, keep_prob: 1.0}
        acc = sess.run(accuracy, test_feed)
        print("Iter"+str(epoch)+",Testing Accuracy="+str(acc))
    # Dump every trained parameter while the session is still open
    for tensor, name in [
        (W_conv1, 'W_conv1'),
        (b_conv1, 'b_conv1'),
        (W_conv2, 'W_conv2'),
        (b_conv2, 'b_conv2'),
        (W_fc1, 'W_fc1'),
        (b_fc1, 'b_fc1'),
        (W_fc2, 'W_fc2'),
        (b_fc2, 'b_fc2'),
    ]:
        Parameter_record(tensor, name)

为了方便读取，最后训练得到的权值参数以.bin二进制文件的形式存储，可以放在SD卡中供ZYNQ读取；.dat文件则以文本形式存储同样的浮点数据，方便用户查看。

二、Vivado Block Design硬件设计

首先新建项目，这里我使用的是ZYNQ7020的开发板。

点击Settings->IP->Repository将先前设计的HLS项目所在路径添加进去，系统会提示检测出两个IP核，分别是设计的卷积IP和池化IP。

点击Create Block Design，点击+号先将ZYNQ7 Processing System添加进去，Run Block Automation，本案例中需要使用到ZYNQ的HP0和HP1接口，将其勾选。

同时本案例需要用到串口打印消息，还需要用到SD卡读取权值和图片，所以需要分别使能UART0和SD0。

最后根据板子实际配置DDR控制器。

接着添加刚刚导入的两个IP核。

Run Connection Automation，这里注意HP1接口需要连接Pool模块

完整的Block Design。

设计好Block Design后，Validate Design验证设计的合理性，接着在菜单栏右键分别执行Generate Output Products和Create HDL Wrapper，然后Generate Bitstream。操作无误后，File->Export->Export Hardware。

三、Vitis SDK设计

我用的Vivado版本为2019.2，配套的SDK软件为Vitis。在Vivado中点击Tools->Launch Vitis启动Vitis，然后选择Create Platform Project->Create from hardware specification(XSA)，选中上一步Export Hardware生成的XSA文件。

系统生成platform后，在生成的platform上右键选择New Application，命名后选择刚刚生成的platform。

接着一路Next，模板选择Hello World。由于本案例要使用FATFS，所以要在platform下的板级支持包（BSP）中添加文件系统的相关库函数。

在弹出的界面中勾选“xilffs”， xilffs即为FATFS库。

配置完成后，编写函数，利用HLS生成的卷积、池化硬件电路实现TensorFlow中设计的卷积神经网络的效果。这里全连接层可以看成特殊的卷积层：此时卷积核的大小与输入特征图的大小相等，且Padding方式为Valid。

#include <stdio.h>
#include "platform.h"
#include "xil_printf.h"
#include "xil_cache.h"
#include "Convolution.h"
#include "Pool.h"
#include "sd.h"

// Statically allocated buffers for the whole network. The dimensions match the
// TensorFlow model: kernels are [filter_height][filter_width][in_channels][out_channels];
// feature maps are presumably [height][width][channel] - confirm against the HLS IP.

// Input image plus Conv1 weights, bias, convolution output and pooled output
float image[28][28][1];
float W_conv1[3][3][1][12];
float b_conv1[12];
float h_conv1[28][28][12];
float h_pool1[14][14][12];

// Conv2 weights, bias, convolution output and pooled output
float W_conv2[3][3][12][24];
float b_conv2[24];
float h_conv2[14][14][24];
float h_pool2[7][7][24];

// FC1 weights, bias and output (7*7*24 inputs, 96 outputs)
float W_fc1[7*7*24][96];
float b_fc1[96];
float h_fc1[96];

// FC2 weights, bias and output (96 inputs, 10 class scores)
float W_fc2[96][10];
float b_fc2[10];
float h_fc2[10];

// Entry point: initialises the platform, the Conv/Pool accelerators and the SD
// card, loads the trained weights, then classifies the MNIST test images one by
// one and prints the predicted digit next to its label over the UART.
// Returns 0 on success, -1 if one of the accelerators cannot be initialised.
int main()
{
    // Number of test images classified by the loop below
    const int num_test_images = 10000;

    init_platform();
    Xil_DCacheDisable();

    // Without a working accelerator every later RunConv/RunPool call would use
    // an uninitialised driver instance, so stop instead of continuing.
    XConv xconv;
    if(XConv_Initialize(&xconv,XPAR_CONV_0_DEVICE_ID)!=XST_SUCCESS)
    {
        xil_printf("XConv device not found\r\n");
        cleanup_platform();
        return -1;
    }

    XPool xpool;
    if(XPool_Initialize(&xpool,XPAR_POOL_0_DEVICE_ID)!=XST_SUCCESS)
    {
        xil_printf("XPool device not found\r\n");
        cleanup_platform();
        return -1;
    }

    // Initialise the SD card
    SD_Init();
    print("Hello World\r\n");

    // Load the trained parameters; the element counts match the array sizes
    LoadWeight("W_conv1.bin",3*3*1*12,W_conv1[0][0][0]);
    LoadWeight("b_conv1.bin",12,b_conv1);

    LoadWeight("W_conv2.bin",3*3*12*24,W_conv2[0][0][0]);
    LoadWeight("b_conv2.bin",24,b_conv2);

    LoadWeight("W_fc1.bin",7*7*24*96,W_fc1[0]);
    LoadWeight("b_fc1.bin",96,b_fc1);

    LoadWeight("W_fc2.bin",96*10,W_fc2[0]);
    LoadWeight("b_fc2.bin",10,b_fc2);

    for(int i=0;i<num_test_images;i++)
    {
        int label;
        // Read the i-th test image into image[]; its label is stored in label
        rdMNISTs28(i, image, &label);

        //Conv1
        RunConv(&xconv,1,28,28,12,//CHin,Hin,Win,CHout
                3,3,1,1,1,1,//Kx,Ky,Sx,Sy,mode,relu_en
                image[0][0],W_conv1[0][0][0],b_conv1,h_conv1[0][0]);//feature_in,W,bias,feature_out
        RunPool(&xpool,12,28,28,//CHin,Hin,Win
                2,2,2,//Kx,Ky,mode
                h_conv1[0][0],h_pool1[0][0]);//feature_in,feature_out

        //Conv2
        RunConv(&xconv,12,14,14,24,//CHin,Hin,Win,CHout
                3,3,1,1,1,1,//Kx,Ky,Sx,Sy,mode,relu_en
                h_pool1[0][0],W_conv2[0][0][0],b_conv2,h_conv2[0][0]);//feature_in,W,bias,feature_out
        RunPool(&xpool,24,14,14,//CHin,Hin,Win
                2,2,2,//Kx,Ky,mode
                h_conv2[0][0],h_pool2[0][0]);//feature_in,feature_out

        //FC1: a fully connected layer run as a 7x7 convolution over the 7x7 input
        RunConv(&xconv,24,7,7,96,//CHin,Hin,Win,CHout
                7,7,1,1,0,1,//Kx,Ky,Sx,Sy,mode,relu_en
                h_pool2[0][0],W_fc1[0],b_fc1,h_fc1);//feature_in,W,bias,feature_out

        //FC2: the TensorFlow output layer has no ReLU, so relu_en is 0 here.
        //Clamping negative scores to 0 could create ties and change the argmax.
        RunConv(&xconv,96,1,1,10,//CHin,Hin,Win,CHout
                1,1,1,1,0,0,//Kx,Ky,Sx,Sy,mode,relu_en
                h_fc1,W_fc2[0],b_fc2,h_fc2);//feature_in,W,bias,feature_out

        // Argmax over the 10 class scores. Softmax is monotonic, so the index of
        // the largest raw score equals the index of the largest probability.
        // Seeding with element 0 avoids relying on a magic lower bound.
        int num=0;
        float max=h_fc2[0];
        for(int m=1;m<10;m++)
        {
            if(h_fc2[m]>max)
            {
                max=h_fc2[m];
                num=m;
            }
        }
        xil_printf("predicted=%d, label=%d\r\n",num,label);
    }

    cleanup_platform();
    return 0;
}

串口将打印出每张MNIST图像经过所搭建的硬件计算得到的手写数字识别结果及其对应的标签，如果二者相同，说明硬件设计没有问题。

四、案例验证

首先制作SD卡，需要格式化为FATFS库支持的FAT文件系统（如FAT32），这里我使用DiskGenius这个软件进行格式化。格式化完以后，将卷积神经网络训练好的权值文件以及测试用的数据集图片和标签文件放在SD卡中。这里测试用的数据集文件为t10k-images-idx3-ubyte.gz和t10k-labels-idx1-ubyte.gz解压后得到的文件，分别重命名为Testimgs.x3和Testlbls.x1。