一. DNN介绍
深度神经网络(Deep Neural Networks, 以下简称DNN)是深度学习的基础,而要理解DNN,首先我们要理解DNN模型,下面我们就对DNN的模型与前向传播算法做一个总结。
DNN神经网络的组成:
- 输入层:神经网络的第一层,原始的样本数据
- 隐藏层:除了输入层,输出层,中间的都是隐藏层
- 输出层:神经网络的最后一层,最终的计算结果
神经网络的特点:
- 每个连接都有个权值
- 同一层神经元之间没有连接
- 最后的输出结果对应的层(输出层)也称之为全连接层;实际上在DNN中,相邻两层的神经元之间都是两两相连(全连接)的
二. 摒弃神经网络的所有框架,基于 NumPy 纯手写 DNN 神经网络,实现 MNIST 手写体识别(TensorFlow 仅用于读取 MNIST 数据),具体代码如下:
import ssl
import numpy as np
from sklearn.utils.extmath import safe_sparse_dot
from sklearn.datasets import fetch_openml
from tensorflow.examples.tutorials.mnist import input_data
# NOTE(review): this replaces the factory used for default HTTPS contexts with
# the unverified one, so certificate checks are skipped for the whole process.
# Presumably a workaround for dataset downloads failing on certificate
# errors -- confirm it is still needed and do not copy it into production code.
ssl._create_default_https_context = ssl._create_unverified_context
def y_hot(y_true):
    """Return the one-hot encoding of a single class label.

    Args:
        y_true: Class index in ``0..9``; any value accepted by ``int()``.

    Returns:
        A float NumPy vector of length 10 that is 1 at position
        ``int(y_true)`` and 0 everywhere else.
    """
    y_one = np.zeros(10)
    y_one[int(y_true)] = 1
    return y_one
def log_loss(y_true, y_prob):
    """Return the mean cross-entropy between targets and predictions.

    Args:
        y_true: 2-D array of targets, one row per sample. A single column is
            treated as a binary target and expanded to ``[1 - y, y]``.
        y_prob: 2-D array of predicted probabilities with the same layout.
            A single column is expanded to ``[1 - p, p]`` in the same way.

    Returns:
        ``-sum(y_true * log(y_prob))`` divided by the number of rows.
    """
    # Keep probabilities away from 0 and 1 so the logarithm stays finite.
    y_prob = np.clip(y_prob, 1e-10, 1 - 1e-10)
    if y_prob.shape[1] == 1:
        y_prob = np.append(1 - y_prob, y_prob, axis=1)
    if y_true.shape[1] == 1:
        y_true = np.append(1 - y_true, y_true, axis=1)
    return -np.sum(y_true * np.log(y_prob)) / y_prob.shape[0]
def softmax(x):
    """Apply a numerically stable softmax along the last axis, in place.

    The maximum of each row is subtracted before exponentiating so large
    scores do not overflow. The result is written back into ``x``, so ``x``
    must be a floating-point array; the same array object is returned.

    Args:
        x: Float array of scores. Rows of a 2-D array are normalized
            independently; a 1-D array is treated as a single row.

    Returns:
        ``x`` itself, now holding probabilities that sum to 1 along the
        last axis.
    """
    # keepdims keeps the reduced axis so the result broadcasts against x for
    # both 1-D and 2-D input (the original axis=1 form rejected 1-D input).
    shifted = x - x.max(axis=-1, keepdims=True)
    np.exp(shifted, out=x)
    x /= x.sum(axis=-1, keepdims=True)
    return x
def relu(x):
    """Apply the ReLU activation ``max(x, 0)`` to ``x`` in place.

    Uses ``np.maximum`` rather than clipping against ``np.finfo(x.dtype).max``
    so that integer arrays are accepted as well (``np.finfo`` rejects integer
    dtypes). Unlike the clip form, ``+inf`` entries are left as ``+inf``.

    Args:
        x: Numeric NumPy array; it is overwritten with the result.

    Returns:
        ``x`` itself, with every negative entry replaced by 0.
    """
    np.maximum(x, 0, out=x)
    return x
def relu_delta(z, delta):
    """Apply the ReLU derivative to a back-propagated error, in place.

    The ReLU derivative is 0 wherever the unit is inactive, so the matching
    entries of ``delta`` are zeroed. The mask is ``z <= 0``: for post-ReLU
    activations (which are never negative) this is the same as ``z == 0``,
    and it stays correct if pre-activation values are passed instead.

    Args:
        z: Array of layer values, same shape as ``delta``.
        delta: Error array for that layer; modified in place.

    Returns:
        None. The result is left in ``delta``.
    """
    delta[z <= 0] = 0
def gen_batches(n, bs):
    """Yield consecutive slices that cover ``range(n)`` in batches of ``bs``.

    Every slice spans ``bs`` items except possibly the last one, which holds
    the remainder when ``n`` is not a multiple of ``bs``.

    Args:
        n: Total number of items.
        bs: Batch size; must be positive.

    Yields:
        ``slice(start, end)`` objects in increasing order.

    Raises:
        ValueError: If ``bs`` is not positive.
    """
    if bs <= 0:
        raise ValueError(f"bs must be positive, got {bs}")
    start = 0
    for _ in range(int(n // bs)):
        end = start + bs
        yield slice(start, end)
        # Advance the window; without this every batch would be slice(0, bs).
        start = end
    if start < n:
        # Remainder that does not fill a whole batch.
        yield slice(start, n)
# Load the MNIST training images and labels from ./data/mnist/.
# NOTE(review): `tensorflow.examples.tutorials` appears to ship only with
# TensorFlow 1.x -- confirm the installed version still provides it.
mnist = input_data.read_data_sets("./data/mnist/")
x, y = mnist.train.images, mnist.train.labels
# One-hot encode every label so y gets one row per sample.
y = np.array([y_hot(label) for label in y])
# Hyperparameters.
hidden_layers_sizes = [300, 100]  # number of units in each hidden layer
max_iter = 200  # number of passes over the training set
alpha = 0.0001  # L2 regularization strength
lr = 0.001  # learning rate of the gradient step
# Derive the layer layout from the data.
n_samples, n_features = x.shape
n_outputs = y.shape[1]
batch_size = min(200, n_samples)
# Units per layer: input, hidden layers, output.
layer_units = [n_features] + hidden_layers_sizes + [n_outputs]
n_layers = len(layer_units)
# Initialize one weight matrix and one bias vector per pair of consecutive
# layers (so there is one fewer parameter set than there are layers).
coefs_ = []
intercept_ = []
for fan_in, fan_out in zip(layer_units[:-1], layer_units[1:]):
    # Xavier (Glorot) uniform initialization: sample from
    # [-bound, bound] with bound = sqrt(6 / (fan_in + fan_out)).
    init_bound = np.sqrt(6 / (fan_in + fan_out))
    coefs_.append(np.random.uniform(-init_bound, init_bound, (fan_in, fan_out)))
    intercept_.append(np.random.uniform(-init_bound, init_bound, fan_out))
# Per-layer containers for the forward-pass outputs. Slot 0 is the network
# input; the remaining slots are placeholders that training overwrites.
activations = [x]
activations.extend(np.empty((batch_size, n_fan_out)) for n_fan_out in layer_units[1:])
# One error buffer per non-input layer. Back-propagation never needs a delta
# for the input layer, so no dataset-sized array is allocated for it.
deltas = [np.empty_like(a_layer) for a_layer in activations[1:]]
# Gradient buffers matching the shapes of the weight matrices and biases.
coef_grads = [
    np.empty((n_fan_in, n_fan_out))
    for n_fan_in, n_fan_out in zip(layer_units[:-1], layer_units[1:])
]
intercept_grads = [np.empty(n_fan_out) for n_fan_out in layer_units[1:]]
# Train with mini-batch gradient descent for max_iter passes over the data.
for it in range(max_iter):
    # Shuffle samples and labels with the same permutation each epoch.
    arr = np.arange(n_samples)
    np.random.shuffle(arr)
    x = x[arr]
    y = y[arr]
    accmult_loss = 0.0
    for batch_slice in gen_batches(n_samples, batch_size):
        batch_x = x[batch_slice]
        batch_y = y[batch_slice]
        # The last batch may be smaller than batch_size, so always average
        # over the actual number of rows in this batch.
        n_batch = len(batch_y)
        activations[0] = batch_x

        # Forward pass.
        for i in range(n_layers - 1):
            activations[i + 1] = safe_sparse_dot(activations[i], coefs_[i])
            activations[i + 1] += intercept_[i]
            # Hidden layers use ReLU; the output layer is handled below.
            if (i + 1) != (n_layers - 1):
                activations[i + 1] = relu(activations[i + 1])
        # The output layer turns its scores into class probabilities.
        activations[-1] = softmax(activations[-1])

        # Average cross-entropy of the batch plus the L2 penalty.
        loss = log_loss(batch_y, activations[-1])
        values = np.sum(np.array([np.dot(s.ravel(), s.ravel()) for s in coefs_]))
        loss += (0.5 * alpha) * values / n_batch
        accmult_loss += loss * n_batch

        # Backward pass, starting from the last weight matrix.
        last = n_layers - 2
        # For softmax with cross-entropy the output error is (y_hat - y).
        deltas[last] = activations[-1] - batch_y
        coef_grads[last] = safe_sparse_dot(activations[last].T, deltas[last])  # data term
        coef_grads[last] += alpha * coefs_[last]  # L2 term uses the weights
        coef_grads[last] /= n_batch  # average over the batch
        intercept_grads[last] = np.mean(deltas[last], 0)
        # Propagate the error back through the hidden layers.
        for i in range(n_layers - 2, 0, -1):
            deltas[i - 1] = safe_sparse_dot(deltas[i], coefs_[i].T)
            # Zero the error where the ReLU unit was inactive.
            relu_delta(activations[i], deltas[i - 1])
            coef_grads[i - 1] = safe_sparse_dot(activations[i - 1].T, deltas[i - 1])  # data term
            coef_grads[i - 1] += alpha * coefs_[i - 1]  # L2 term
            coef_grads[i - 1] /= n_batch  # average over the batch
            intercept_grads[i - 1] = np.mean(deltas[i - 1], 0)

        # Gradient step. `param += update` mutates each array in place, so
        # the entries of coefs_ and intercept_ are updated through `params`.
        grads = coef_grads + intercept_grads
        updates = [-lr * grad for grad in grads]
        params = coefs_ + intercept_
        for param, update in zip(params, updates):
            param += update
    # Mean loss per sample over the whole epoch.
    loss_ = accmult_loss / x.shape[0]
    print(f"iterration {it}, loss {loss_}")