不使用神经网络框架——纯手写 DNN 神经网络，实现 MNIST 识别

Starry-sky(jing)

已于 2022-09-01 21:49:57 修改

阅读量668

点赞数 2

分类专栏：神经网络杂记 python 文章标签：深度学习神经网络 dnn 人工智能 python

于 2022-09-01 21:39:08 首次发布

本文链接：https://blog.csdn.net/llm765800916/article/details/126570928

版权

python 同时被 2 个专栏收录

21 篇文章 1 订阅

订阅专栏

神经网络杂记

4 篇文章 0 订阅

订阅专栏

一. DNN介绍

深度神经网络（Deep Neural Networks，以下简称 DNN）是深度学习的基础。要理解 DNN，首先要理解 DNN 的模型结构，下面我们就对 DNN 的模型与前向传播算法做一个总结。

DNN神经网络的组成：

输入层：神经网络的第一层，原始的样本数据
隐藏层：位于输入层和输出层之间的所有层都是隐藏层
输出层：神经网络的最后一层，最终的计算结果

神经网络的特点：

每个连接都有个权值
同一层神经元之间没有连接
最后的输出结果对应的层也称之为全连接层

在这里插入图片描述

二. 摒弃所有神经网络框架，纯手写 DNN 神经网络，实现 MNIST 手写体识别，具体代码如下：

import ssl
import numpy as np
from sklearn.utils.extmath import safe_sparse_dot
from sklearn.datasets import fetch_openml
from tensorflow.examples.tutorials.mnist import input_data

ssl._create_default_https_context = ssl._create_unverified_context

def y_hot(y_true):
    """Return a length-10 one-hot vector for a single class label.

    Args:
        y_true: Class index; any value accepted by ``int()``.

    Returns:
        A float array of shape ``(10,)`` that is all zeros except for a 1
        at position ``int(y_true)``.

    Raises:
        IndexError: If ``int(y_true)`` is not a valid index into 10 slots.
    """
    y_one = np.zeros(10)
    y_one[int(y_true)] = 1
    return y_one

def log_loss(y_true, y_prob):
    """Compute the mean cross-entropy between targets and predictions.

    Args:
        y_true: 2-D array of target indicators, one row per sample.
        y_prob: 2-D array of predicted probabilities, same layout.

    Returns:
        The negative sum of ``y_true * log(y_prob)`` divided by the number
        of rows in ``y_prob``.

    A single-column input is treated as the positive-class column of a
    binary problem and expanded to two columns ``[1 - p, p]``.
    """
    # Keep probabilities away from 0 and 1 so the logarithm stays finite.
    y_prob = np.clip(y_prob, 1e-10, 1 - 1e-10)
    if y_prob.shape[1] == 1:
        y_prob = np.append(1 - y_prob, y_prob, axis=1)
    if y_true.shape[1] == 1:
        y_true = np.append(1 - y_true, y_true, axis=1)
    return -np.sum(y_true * np.log(y_prob)) / y_prob.shape[0]

def softmax(x):
    """Apply a softmax along axis 1 of ``x`` in place and return ``x``.

    The maximum along axis 1 is subtracted before exponentiating, so large
    inputs do not overflow. The contents of ``x`` are overwritten.
    """
    shifted = x - x.max(axis=1, keepdims=True)
    np.exp(shifted, out=x)
    x /= x.sum(axis=1, keepdims=True)
    return x
	 
def relu(x):
    """Clamp ``x`` in place to ``[0, largest finite value of its dtype]``.

    Negative entries become 0. The same array object is returned.
    """
    upper = np.finfo(x.dtype).max
    np.clip(x, 0, upper, out=x)
    return x

def relu_delta(z, delta):
    """Apply the ReLU derivative to ``delta`` in place.

    Every entry of ``delta`` whose matching entry in ``z`` equals 0 is set
    to 0; all other entries are left untouched. Returns ``None``.
    """
    inactive = z == 0
    delta[inactive] = 0

def gen_batches(n, bs):
    """Yield consecutive slices that split ``range(n)`` into batches.

    Args:
        n: Total number of items.
        bs: Batch size; must be positive.

    Yields:
        ``slice(start, end)`` objects of length ``bs``, followed by one
        shorter slice for the remainder when ``n`` is not a multiple of
        ``bs``.

    Raises:
        ValueError: If ``bs`` is not positive.
    """
    if bs <= 0:
        raise ValueError(f"gen_batches got bs={bs}, must be positive")
    start = 0
    for _ in range(int(n // bs)):
        end = start + bs
        yield slice(start, end)
        # Advance the window; without this every batch would restart at 0.
        start = end
    if start < n:
        yield slice(start, n)



# Read the MNIST data set from ./data/mnist/ and take the training split.
mnist = input_data.read_data_sets("./data/mnist/")
x, y = mnist.train.images, mnist.train.labels

# Convert every label to a one-hot row with y_hot.
y = np.array([y_hot(label) for label in y])

# Network hyper-parameters
hidden_layers_sizes = [300, 100]  # units in each hidden layer
max_iter = 200  # number of passes over the training data
alpha = 0.0001  # L2 penalty coefficient
lr = 0.001  # learning rate


# Derive the network shape from the data
n_samples, n_features = x.shape
n_outputs = y.shape[1]
batch_size = min(200, n_samples)
# Layer widths from input to output: [input, hidden..., output]
layer_units = [n_features] + hidden_layers_sizes + [n_outputs]
n_layers = len(layer_units)

# Initialise one weight matrix and one bias vector per pair of adjacent
# layers (n_layers - 1 pairs in total).
coefs_ = []
intercept_ = []
factor = 6  # numerator of the Xavier (Glorot) uniform bound
for layer in range(n_layers - 1):
    fan_in, fan_out = layer_units[layer], layer_units[layer + 1]

    # Xavier initialisation: sample uniformly from [-bound, bound]
    init_bound = np.sqrt(factor / (fan_in + fan_out))
    coefs_.append(
        np.random.uniform(-init_bound, init_bound, (fan_in, fan_out))
    )
    intercept_.append(np.random.uniform(-init_bound, init_bound, fan_out))


# Containers for the per-layer results. Slot 0 holds the network input; the
# remaining slots are placeholders that the training loop overwrites.
activations = [x]
activations.extend(
    np.empty((batch_size, n_fan_out)) for n_fan_out in layer_units[1:]
)
deltas = [np.empty_like(a_layer) for a_layer in activations]


# Gradient containers: one weight-gradient matrix and one bias-gradient
# vector for each pair of adjacent layers.
coef_grads = [
    np.empty((n_fan_in, n_fan_out))
    for n_fan_in, n_fan_out in zip(layer_units[:-1], layer_units[1:])
]
intercept_grads = [np.empty(n_fan_out) for n_fan_out in layer_units[1:]]

# Train with mini-batch gradient descent.
for it in range(max_iter):
    # Reshuffle samples and labels together at the start of every pass.
    arr = np.arange(n_samples)
    np.random.shuffle(arr)
    x = x[arr]
    y = y[arr]
    accmult_loss = 0.0
    for batch_slice in gen_batches(n_samples, batch_size):
        batch_x = x[batch_slice]
        batch_y = y[batch_slice]
        # The final batch can be shorter than batch_size, so always average
        # over the actual number of rows in this batch.
        n_batch = len(batch_y)

        # Feed the batch into the input slot.
        activations[0] = batch_x

        # Forward pass
        for i in range(n_layers - 1):
            activations[i + 1] = safe_sparse_dot(activations[i], coefs_[i])
            activations[i + 1] += intercept_[i]

            # ReLU on hidden layers only
            if (i + 1) != (n_layers - 1):
                activations[i + 1] = relu(activations[i + 1])

        # Softmax on the output layer gives the predicted probabilities.
        activations[-1] = softmax(activations[-1])

        # Mean cross-entropy of the batch
        loss = log_loss(batch_y, activations[-1])
        # Add the L2 penalty: 0.5 * alpha * sum of squared weights
        values = sum(np.dot(s.ravel(), s.ravel()) for s in coefs_)
        loss += (0.5 * alpha) * values / n_batch
        accmult_loss += loss * n_batch

        # Backward pass
        last = n_layers - 2  # index of the last weight matrix
        # Output error for softmax combined with cross-entropy
        deltas[last] = activations[-1] - batch_y

        # Gradient of the last weight matrix
        coef_grads[last] = safe_sparse_dot(activations[last].T, deltas[last])
        coef_grads[last] += alpha * coefs_[last]  # L2 term
        coef_grads[last] /= n_batch  # average over the batch
        intercept_grads[last] = np.mean(deltas[last], 0)

        # Walk back through the hidden layers.
        for i in range(n_layers - 2, 0, -1):
            # Propagate the error through the weights of layer i ...
            deltas[i - 1] = safe_sparse_dot(deltas[i], coefs_[i].T)

            # ... and through the ReLU derivative of that layer's output.
            relu_delta(activations[i], deltas[i - 1])

            coef_grads[i - 1] = safe_sparse_dot(
                activations[i - 1].T, deltas[i - 1]
            )
            coef_grads[i - 1] += alpha * coefs_[i - 1]  # L2 term
            coef_grads[i - 1] /= n_batch  # average over the batch
            intercept_grads[i - 1] = np.mean(deltas[i - 1], 0)

        # Gradient-descent step; the arrays are updated in place so that
        # coefs_ and intercept_ see the new values.
        grads = coef_grads + intercept_grads
        params = coefs_ + intercept_
        for param, grad in zip(params, grads):
            param -= lr * grad
    loss_ = accmult_loss / x.shape[0]
    print(f"iterration {it}, loss {loss_}")