# 神经网络实例Python

本文采用一个模拟数据集进行神经网络的训练，相关知识点包括数据预处理、BN层、神经网络模型、梯度反向传播、梯度检查、监测训练过程、超参数随机搜索等，使读者掌握一个完整的机器学习流程。

### 1、生成数据

生成一个线性不可分的数据集，它是一个随时间增长的振荡数据。

import numpy as np
import matplotlib.pyplot as plt

# Visualize a 2-D data set, colouring each point by its class label.
def show_data(X, labels):
    """Scatter-plot the first two columns of X, coloured by class label."""
    plt.scatter(X[:, 0], X[:, 1], c=labels, s=10, cmap=plt.cm.Spectral)
    plt.show()

def gen_random_data(dim, N_class, num_samp_per_class):
    """Draw a purely random (Gaussian) data set with random class labels.

    Returns (X, labels); X has one example per row.
    """
    total = num_samp_per_class * N_class
    X = np.random.randn(total, dim)                  # each row = one example
    labels = np.random.randint(N_class, size=total)  # random class per example
    show_data(X, labels)
    return (X, labels)

# Module-level problem settings; several functions below read these globals.
dim = 2  # dimensionality
N_class = 4  # number of classes
layer_param = [dim, 10, 20, N_class]  # layer widths: input, two hidden, output
(X, labels) = gen_random_data(dim, N_class, num_samp_per_class=20)

# Generate a linearly non-separable toy data set: N_class phase-shifted,
# oscillating, upward-trending curves with a little Gaussian noise.
def gen_toy_data(dim, N_class, num_samp_per_class):
    """Build the synthetic oscillation data set used for training.

    Returns (X, labels); X has num_samp_per_class * N_class rows.
    """
    total = num_samp_per_class * N_class
    X = np.zeros((total, dim))               # one example per row
    labels = np.zeros(total, dtype='uint8')  # class label of each row
    for j in range(N_class):
        rows = range(num_samp_per_class * j, num_samp_per_class * (j + 1))
        x = np.linspace(-np.pi, np.pi, num_samp_per_class) + 5  # x axis
        # phase-shifted sinusoid + higher-frequency ripple + linear trend
        y = np.sin(x + j * np.pi / (0.5 * N_class))
        y += 0.2 * np.sin(10 * x + j * np.pi / (0.5 * N_class))
        y += 0.25 * x + 10
        y += np.random.randn(num_samp_per_class) * 0.1  # observation noise
        X[rows] = np.c_[x, y]
        labels[rows] = j
    show_data(X, labels)
    return (X, labels)

### 2、数据预处理

def normalize(X):
    """Standardize features: subtract the column mean, divide by the std.

    Returns (X_norm, mean, std) so the identical transform can be applied
    to validation/test data.
    """
    mean = np.mean(X, axis=0)
    centered = X - mean
    std = np.std(centered, axis=0)
    centered /= std + 10**(-5)  # epsilon guards against zero variance
    return (centered, mean, std)

#### PCA白化

def PCA_white(X):
    """PCA-whiten X: rotate onto the principal axes, scale to unit variance.

    Returns (X_white, mean, U, S); mean/U/S let the caller apply the same
    transform to held-out data.
    """
    mean = np.mean(X, axis=0)
    centered = X - mean
    cov = np.dot(centered.T, centered) / centered.shape[0]
    U, S, V = np.linalg.svd(cov)
    rotated = np.dot(centered, U)      # decorrelate the features
    rotated /= np.sqrt(S + 10**(-5))   # equalise per-axis variance
    return (rotated, mean, U, S)

def split_data(X, labels):
    """Randomly split (X, labels) into train/val/test as 1/2, 1/4, 1/4."""
    split1 = 2  # training set gets 1/split1 of the data
    split2 = 4  # validation set gets 1/split2
    num_examples = X.shape[0]
    order = list(range(num_examples))
    np.random.shuffle(order)

    n_train = num_examples // split1
    n_val = num_examples // split2
    train_idx = order[:n_train]
    val_idx = order[n_train:n_train + n_val]
    # NOTE: -num_examples // 4 parses as (-num_examples) // 4, i.e. a
    # rounded-up quarter when num_examples is not divisible by 4.
    test_idx = order[-num_examples // 4:]
    return (X[train_idx], labels[train_idx],
            X[val_idx], labels[val_idx],
            X[test_idx], labels[test_idx])

def data_preprocess(X_train, X_val, X_test):
    """PCA-whiten the training set and apply the identical transform
    (training mean, rotation U and scaling from S) to val and test sets."""
    (X_train_pca, mean, U, S) = PCA_white(X_train)
    scale = np.sqrt(S + 10**(-5))
    X_val_pca = np.dot(X_val - mean, U) / scale
    X_test_pca = np.dot(X_test - mean, U) / scale
    return (X_train_pca, X_val_pca, X_test_pca)

### 3、网络模型

# layer_param = [dim, 100, 100, N_class]
def initialize_parameters(layer_param):
    """He-initialise weights (damped by 0.5) and zero biases for a fully
    connected net; also allocate zeroed Nesterov-momentum buffers.

    Returns (weights, biases, vweights, vbiases), one entry per layer.
    """
    weights, biases = [], []
    vweights, vbiases = [], []
    for in_depth, out_depth in zip(layer_param[:-1], layer_param[1:]):
        std = np.sqrt(2 / in_depth) * 0.5  # He init scaled down by 0.5
        weights.append(std * np.random.randn(in_depth, out_depth))
        biases.append(np.zeros((1, out_depth)))
        vweights.append(np.zeros((in_depth, out_depth)))
        vbiases.append(np.zeros((1, out_depth)))
    return (weights, biases, vweights, vbiases)

def forward(X, layer_param, weights, biases):
    """Forward pass: ReLU on every hidden layer, raw scores on the last.

    Returns (hiddens, scores); hiddens[0] is the input X itself.
    """
    hiddens = [X]
    for i in range(len(layer_param) - 2):
        pre_act = np.dot(hiddens[i], weights[i]) + biases[i]
        hiddens.append(np.maximum(0, pre_act))  # ReLU
    # The output layer stays linear; the softmax lives in the loss function.
    scores = np.dot(hiddens[-1], weights[-1]) + biases[-1]
    return (hiddens, scores)

def data_loss_softmax(scores, labels):
    """Mean softmax cross-entropy loss over the batch.

    Fix: scores are shifted by their row-wise max before exponentiating so
    that large scores cannot overflow np.exp — softmax is shift-invariant,
    so the result is mathematically unchanged.
    """
    num_examples = scores.shape[0]
    shifted = scores - np.max(scores, axis=1, keepdims=True)
    exp_scores = np.exp(shifted)
    exp_scores_sum = np.sum(exp_scores, axis=1)
    correct_probs = exp_scores[range(num_examples), labels] / exp_scores_sum
    correct_logprobs = -np.log(correct_probs)
    data_loss = np.sum(correct_logprobs) / num_examples
    return data_loss

def reg_L2_loss(weights, reg):
    """Total L2 penalty: 0.5 * reg * sum of squared weights over all layers."""
    return sum(0.5 * reg * np.sum(w * w) for w in weights)

def dscores_softmax(scores, labels):
    """Gradient of the mean softmax cross-entropy loss w.r.t. scores.

    Fix: uses the row-wise max-shift trick so np.exp cannot overflow on
    large scores; the probabilities (hence the gradient) are unchanged.
    """
    num_examples = scores.shape[0]
    shifted = scores - np.max(scores, axis=1, keepdims=True)
    exp_scores = np.exp(shifted)
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
    dscores = probs
    dscores[range(num_examples), labels] -= 1  # d(loss)/d(score) = p - 1[y]
    dscores /= num_examples
    return dscores

def predict(X, labels, layer_param, weights, biases):
    """Run a forward pass and return classification accuracy on (X, labels)."""
    activation = X
    for i in range(len(layer_param) - 2):
        activation = np.maximum(0, np.dot(activation, weights[i]) + biases[i])
    scores = np.dot(activation, weights[-1]) + biases[-1]
    predicted = np.argmax(scores, axis=1)
    return np.mean(predicted == labels)

def gradient_backprop(dscores, hiddens, weights, biases, reg):
    """Backpropagate dscores through the fully connected ReLU network.

    Returns (dweights, dbiases) in REVERSE layer order (last layer first),
    matching how nesterov_momentumSGD indexes dparams[-1-i].
    """
    dweights, dbiases = [], []
    dhidden = dscores
    for i in reversed(range(len(hiddens))):
        dweights.append(np.dot(hiddens[i].T, dhidden) + reg * weights[i])
        dbiases.append(np.sum(dhidden, axis=0, keepdims=True))
        dhidden = np.dot(dhidden, weights[i].T)
        # ReLU gate; at i == 0 this masks against the raw input X, but that
        # gradient is discarded after the loop anyway.
        dhidden[hiddens[i] <= 0] = 0
    return (dweights, dbiases)

### 4、梯度检查

def check_gradient(X, labels, layer_param, check_weight_or_bias):
    """Compare analytic gradients against centred numerical differences.

    For every layer one randomly chosen weight (check_weight_or_bias truthy)
    or bias entry is perturbed by +/-step and the numeric gradient
    (loss2 - loss1) / (2 * step) is printed next to the analytic one.

    Fixes over the original version:
      * the X/labels arguments are actually used — they were shadowed by a
        freshly generated random data set, making the parameters dead;
      * the relative error uses max(|analytic|, |numeric|) as denominator;
      * the perturbed parameter is restored before checking the next layer.
    """
    (weights, biases, vweights, vbiases) = initialize_parameters(layer_param)
    reg = 10**(-9)   # tiny regularisation so the data loss dominates
    step = 10**(-5)  # finite-difference step size
    for layer in range(len(weights)):
        if check_weight_or_bias:
            row = np.random.randint(weights[layer].shape[0])
            col = np.random.randint(weights[layer].shape[1])
            param = weights[layer][row][col]
        else:
            row = np.random.randint(weights[layer].shape[1])
            param = biases[layer][0][row]
        # Analytic gradient via backprop.
        (hiddens, scores) = forward(X, layer_param, weights, biases)
        dscores = dscores_softmax(scores, labels)
        (dweights, dbiases) = gradient_backprop(dscores, hiddens, weights, biases, reg)
        if check_weight_or_bias:
            danalytic = dweights[-1 - layer][row][col]  # dparams are reversed
        else:
            danalytic = dbiases[-1 - layer][0][row]
        # Numeric gradient: loss at param - step ...
        if check_weight_or_bias:
            weights[layer][row][col] = param - step
        else:
            biases[layer][0][row] = param - step
        (hiddens, scores) = forward(X, layer_param, weights, biases)
        data_loss1 = data_loss_softmax(scores, labels)
        loss1 = data_loss1 + reg_L2_loss(weights, reg)
        # ... and at param + step.
        if check_weight_or_bias:
            weights[layer][row][col] = param + step
        else:
            biases[layer][0][row] = param + step
        (hiddens, scores) = forward(X, layer_param, weights, biases)
        data_loss2 = data_loss_softmax(scores, labels)
        loss2 = data_loss2 + reg_L2_loss(weights, reg)
        dnumeric = (loss2 - loss1) / (2 * step)
        # Restore the perturbed parameter before checking the next layer.
        if check_weight_or_bias:
            weights[layer][row][col] = param
        else:
            biases[layer][0][row] = param

        print(layer, data_loss1, data_loss2)
        error_relative = (np.abs(danalytic - dnumeric)
                          / np.maximum(np.abs(danalytic), np.abs(dnumeric)))
        print(danalytic, dnumeric, error_relative)

### 5、参数优化

def nesterov_momentumSGD(vparams, params, dparams, lr, mu):
    """One Nesterov-momentum SGD step, updating params/vparams IN PLACE.

    dparams is expected in REVERSE layer order (as produced by the backprop
    routines), hence the dparams[-1-i] indexing.  Returns the per-layer
    update ratio |update| / |param| for training diagnostics.
    """
    update_ratio = []
    for i, param in enumerate(params):
        v_prev = vparams[i]
        vparams[i] = mu * vparams[i] - lr * dparams[-1 - i]
        step = vparams[i] + mu * (vparams[i] - v_prev)  # Nesterov lookahead
        param += step  # in-place ndarray update, same object as params[i]
        update_ratio.append(np.sum(np.abs(step)) / np.sum(np.abs(param)))
    return update_ratio

### 6、训练网络

def train_net(X_train, labels_train, layer_param, lr,
              lr_decay, reg, mu, max_epoch, X_val, labels_val):
    """Train the fully connected softmax net with Nesterov-momentum SGD.

    Records data/reg losses and train/val accuracy every epoch, then plots
    the loss and accuracy curves and the per-layer update ratios.

    Fix over the original: the plotting axes `ax`/`ax2` were referenced
    without ever being created, raising NameError — they are now built from
    the figure (ax2 as a twin y-axis for the accuracy curves).

    Returns (data_losses, reg_losses, weights, biases, val_accuracy).
    """
    lr0 = lr  # remember the initial learning rate for the plot labels
    (weights, biases, vweights, vbiases) = initialize_parameters(layer_param)
    epoch = 0
    data_losses = []
    reg_losses = []
    val_accuracy = []
    train_accuracy = []
    weights_update_ratio = []
    baises_update_ratio = []
    while epoch < max_epoch:
        (hiddens, scores) = forward(X_train, layer_param, weights, biases)
        val_accuracy.append(predict(X_val, labels_val, layer_param, weights, biases))
        train_accuracy.append(predict(X_train, labels_train, layer_param, weights, biases))
        data_loss = data_loss_softmax(scores, labels_train)
        reg_loss = reg_L2_loss(weights, reg)
        dscores = dscores_softmax(scores, labels_train)
        (dweights, dbiases) = gradient_backprop(dscores, hiddens, weights, biases, reg)
        weights_update_ratio.append(nesterov_momentumSGD(
            vweights, weights, dweights, lr, mu
        ))
        baises_update_ratio.append(nesterov_momentumSGD(
            vbiases, biases, dbiases, lr, mu
        ))
        data_losses.append(data_loss)
        reg_losses.append(reg_loss)
        epoch += 1
        lr *= lr_decay  # exponential learning-rate decay

    # Visualise the data loss plus train/val accuracy.
    plt.close()
    fig = plt.figure('loss')
    ax = fig.add_subplot(111)  # fix: was used without being defined
    ax2 = ax.twinx()           # second y-axis for the accuracy curves
    ax.grid(True)
    ax2.grid(True)
    plt.xlabel('log10(lr)=' + str(round((np.log10(lr0)), 2)) + ' ' + 'log10(reg)=' + str(round((np.log10(reg)), 2)),
               fontsize=14)
    plt.ylabel('                              accuracy       log10(data loss)', fontsize=14)
    ax.scatter(np.arange(len(data_losses)), np.log10(data_losses), c='b', marker='.')
    ax2.scatter(np.arange(len(val_accuracy)), val_accuracy, c='r', marker='*')
    ax2.scatter(np.arange(len(val_accuracy)), train_accuracy, c='g', marker='.')
    plt.show()

    # %% Log-scale update ratio of weights/biases per layer; a healthy
    # value is around 10**(-3).
    for layer in range(len(weights)):
        wur = [weights_update_ratio[i][layer]
               for i in range(len(weights_update_ratio))]
        bur = [baises_update_ratio[i][layer]
               for i in range(len(baises_update_ratio))]
        plt.close()
        fig = plt.figure('update ratio')
        ax = fig.add_subplot(111)  # fix: was used without being defined
        ax2 = ax.twinx()
        ax.grid(True)
        ax2.grid(True)
        plt.xlabel('log10(lr)=' + str(round((np.log10(lr0)), 2)) + ' ' + 'log10(reg)=' + str(round((np.log10(reg)), 2)),
                   fontsize=14)
        ax.scatter(np.arange(len(wur)), np.log10(wur), c='b', marker='.')
        ax2.scatter(np.arange(len(bur)), np.log10(bur), c='r', marker='*')
        plt.show()

    return (data_losses, reg_losses, weights, biases, val_accuracy)

### 7、过拟合小数据集

def overfit_tinydata(lr=10 ** (-0.0), lr_decay=1, mu=0.9, reg=0, max_epoch=100):
    """Sanity check: a healthy model/optimizer should drive the training
    loss to ~0 on a tiny data set (2 samples per class, no regularisation).

    Uses the module-level globals `dim` and `N_class`.
    """
    (X, labels) = gen_toy_data(dim, N_class, num_samp_per_class=2)
    X, _, _, _ = PCA_white(X)
    layer_param = [dim, 100, 100, N_class]
    (data_losses, reg_losses, weights, biases,
     accuracy) = train_net(X, labels, layer_param, lr, lr_decay,
                           reg, mu, max_epoch, X, labels)
    return (data_losses, reg_losses, accuracy)

### 8、超参数随机搜索

def hyperparam_random_search(X_train, labels_train,
                             X_val, labels_val,
                             layer_param, num_try=10,
                             lr=[-1, -5], lr_decay=1,
                             mu=0.9, reg=[-2.0, -5.0],
                             max_epoch=500):
    """Random search over learning rate and regularisation strength.

    `lr` and `reg` give [low, high] base-10 EXPONENTS; num_try pairs are
    sampled log-uniformly and a network is trained for each.  Returns the
    (weights, biases) of the last configuration trained.
    """
    lo_lr, hi_lr = min(lr), max(lr)
    lo_reg, hi_reg = min(reg), max(reg)
    draws = np.random.rand(num_try * 2)
    lr_array = 10 ** (lo_lr + (hi_lr - lo_lr) * draws[:num_try])
    reg_array = 10 ** (lo_reg + (hi_reg - lo_reg) * draws[num_try:2 * num_try])
    for lr_try, reg_try in zip(lr_array, reg_array):
        (data_loss, reg_loss, weights, biases,
         val_accuracy) = train_net(X_train, labels_train, layer_param,
                                   lr_try, lr_decay, reg_try, mu, max_epoch,
                                   X_val, labels_val)
    return (weights, biases)
# Script entry: build the toy data set, split and whiten it, then run a
# small random hyper-parameter search.  These assignments rebind the
# module-level globals used by the functions above.
dim = 2  # dimensionality
N_class = 4  # number of classes
layer_param = [dim, 100, 100, N_class]
(X,labels) = gen_toy_data(dim, N_class, num_samp_per_class = 200)
(X_train, labels_train, X_val, labels_val, X_test, labels_test) = split_data(X, labels)
(X_train_pca, X_val_pca, X_test_pca) = data_preprocess(X_train, X_val, X_test)
# Only 2 configurations are tried here; increase num_try for a real search.
(weights, biases) = hyperparam_random_search(X_train_pca, labels_train, X_val_pca, labels_val, layer_param,
num_try = 2, lr=[-0.1, -0.2], lr_decay=1, mu=0.9, reg=[-2, -5], max_epoch=2000)


### 9、评估模型

# Final evaluation of the selected model on the held-out test set.
accuracy = predict(X_test_pca, labels_test, layer_param, weights, biases)
print(accuracy) # 0.785

### 10、增加BN层

增加BN层，不会改变程序的总体流程。但是需要注意的是，这里不再需要偏置参数，而是增加了缩放（gamma）和平移（beta）参数，同时需要保留BN层的一些中间结果，如均值和方差等。注意，由于采用全部样本进行训练，所以均值和方差不需要进行移动平均。

# Small constant added to the variance inside batch norm for numerical stability.
BN_EPSILON = 10**(-5)

def initialize_parameters_BN(layer_param):
    """He-initialise weights plus batch-norm scale (gamma=1) / shift
    (beta=0) parameters, and zeroed momentum buffers for all of them.

    With batch norm there are no bias vectors: beta plays that role.
    Returns (params, vparams) with params = (weights, gammas, betas).
    """
    weights, gammas, betas = [], [], []
    vweights, vgammas, vbetas = [], [], []
    for in_depth, out_depth in zip(layer_param[:-1], layer_param[1:]):
        std = np.sqrt(2 / in_depth)  # He initialisation
        weights.append(std * np.random.randn(in_depth, out_depth))
        gammas.append(np.ones((1, out_depth)))
        betas.append(np.zeros((1, out_depth)))

        vweights.append(np.zeros((in_depth, out_depth)))
        vgammas.append(np.zeros((1, out_depth)))
        vbetas.append(np.zeros((1, out_depth)))

    params = (weights, gammas, betas)
    vparams = (vweights, vgammas, vbetas)
    return (params, vparams)

def BN_forward(X, gamma, beta):
    """Batch-norm forward: standardise X over the batch, then scale/shift.

    Returns (Y, bn_cache, X_hat) where bn_cache = (mu, var, std) is needed
    by BN_backprop.  No running averages are kept because training always
    uses the full data set as a single batch.
    """
    mu = np.mean(X, axis=0)
    var = np.var(X, axis=0)
    std = np.sqrt(var + BN_EPSILON)
    X_hat = (X - mu) / std
    Y = gamma * X_hat + beta
    return (Y, (mu, var, std), X_hat)

def BN_backprop(dY, X, X_hat, bn_cache, gamma, beta):
    """Batch-norm backward pass.

    Given dY = dL/dY and the forward cache, returns (dX, dgamma, dbeta).

    Fix over the original: since std = sqrt(var + eps), the chain rule gives
    dvar = 0.5 * dstd / std; the original used var**(-0.5), which ignores
    the epsilon and divides by zero for features with zero variance.
    """
    (mu, var, std) = bn_cache
    batch = X.shape[0]
    dX_hat = gamma * dY
    dbeta = np.sum(dY, axis=0, keepdims=True)
    dgamma = np.sum(X_hat * dY, axis=0, keepdims=True)

    dX = dX_hat / std
    dmu = -np.sum(dX_hat, axis=0, keepdims=True) / std
    dstd = -1 / (std * std) * np.sum((X - mu) * dX_hat, axis=0)
    dvar = 0.5 * dstd / std  # d(sqrt(var+eps))/d(var) = 0.5 / std
    dX += 2 * (X - mu) / batch * dvar
    dX += dmu / batch

    return (dX, dgamma, dbeta)

def forward_BN(X, layer_param, params):
    """Forward pass with batch norm: dot -> BN -> ReLU per hidden layer;
    the output layer is dot -> BN with no ReLU.

    Returns (hiddens, scores, bn_ins, bn_hats, bn_caches); the three bn_*
    lists hold per-layer intermediates needed by gradient_backprop_BN.
    """
    (weights, gammas, betas) = params
    hiddens = [X]
    bn_ins, bn_caches, bn_hats = [], [], []
    num_layers = len(layer_param) - 1
    for i in range(num_layers):
        pre_bn = np.dot(hiddens[i], weights[i])  # (1) linear, no bias
        bn_ins.append(pre_bn)
        (post_bn, cache, x_hat) = BN_forward(pre_bn, gammas[i], betas[i])  # (2) BN
        bn_hats.append(x_hat)
        bn_caches.append(cache)
        if i < num_layers - 1:
            hiddens.append(np.maximum(0, post_bn))  # (3) ReLU
        else:
            scores = post_bn                        # (4) linear output
    return (hiddens, scores, bn_ins, bn_hats, bn_caches)

def predict_BN(X, labels, layer_param, params):
    """Forward pass with batch norm; return the accuracy on (X, labels).

    NOTE(review): BN statistics are recomputed on the evaluation batch
    itself (no running averages) — consistent with full-batch training here.
    """
    (weights, gammas, betas) = params
    activation = X
    num_layers = len(layer_param) - 1
    for i in range(num_layers):
        pre_bn = np.dot(activation, weights[i])
        (post_bn, _cache, _x_hat) = BN_forward(pre_bn, gammas[i], betas[i])
        if i < num_layers - 1:
            activation = np.maximum(0, post_bn)
        else:
            scores = post_bn

    predicted = np.argmax(scores, axis=1)
    return np.mean(predicted == labels)

def gradient_backprop_BN(dscores, params, hiddens, bn_ins, bn_hats, bn_caches, reg):
    """Backpropagate dscores through the batch-norm network.

    Gradients are returned in REVERSE layer order (last layer first),
    matching nesterov_momentumSGD's dparams[-1-i] indexing.
    """
    (weights, gammas, betas) = params
    dweights, dgammas, dbetas = [], [], []

    dhidden = dscores
    for i in reversed(range(len(hiddens))):
        # Undo BN first: the forward order was dot -> BN -> ReLU.
        (dhidden, dgamma, dbeta) = BN_backprop(
            dhidden, bn_ins[i], bn_hats[i], bn_caches[i], gammas[i], betas[i])
        dgammas.append(dgamma)
        dbetas.append(dbeta)
        dweights.append(np.dot(hiddens[i].T, dhidden) + reg * weights[i])
        dhidden = np.dot(dhidden, weights[i].T)
        dhidden[hiddens[i] <= 0] = 0  # ReLU gate for the layer below
    return (dweights, dgammas, dbetas)

#%%
def train_net_BN(data, layer_param, super_params, max_epoch):
    """Train the batch-norm network with Nesterov-momentum SGD.

    data = (X_train, labels_train, X_val, labels_val)
    super_params = (lr_w, lr_decay, mu_w, lr_bn, mu_bn, reg): the weights
    and the BN parameters (gamma/beta) get separate learning rates/momenta.

    Fixes over the original: the plotting axes `ax`/`ax2` were referenced
    without ever being created (NameError) — they are now built from the
    figure; the per-layer update-ratio figure now actually plots the
    collected wur/bur series (mirroring train_net).
    """
    (X_train, labels_train, X_val, labels_val) = data
    (lr_w, lr_decay, mu_w, lr_bn, mu_bn, reg) = super_params
    (params, vparams) = initialize_parameters_BN(layer_param)
    epoch = 0

    data_losses = []
    reg_losses = []
    val_accuracy = []
    train_accuracy = []
    weights_update_ratio = []
    gammas_update_ratio = []
    betas_update_ratio = []

    while epoch < max_epoch:
        (hiddens, scores, bn_ins, bn_hats, bn_caches) = forward_BN(X_train, layer_param, params)

        val_accuracy.append(predict_BN(X_val, labels_val, layer_param, params))
        train_accuracy.append(predict_BN(X_train, labels_train, layer_param, params))

        data_loss = data_loss_softmax(scores, labels_train)
        reg_loss = reg_L2_loss(params[0], reg)  # L2 penalty on weights only
        dscores = dscores_softmax(scores, labels_train)

        (dweights, dgammas, dbetas) = gradient_backprop_BN(
            dscores, params, hiddens, bn_ins, bn_hats, bn_caches, reg)
        weights_update_ratio.append(nesterov_momentumSGD(vparams[0], params[0], dweights, lr_w, mu_w))
        gammas_update_ratio.append(nesterov_momentumSGD(vparams[1], params[1], dgammas, lr_bn, mu_bn))
        betas_update_ratio.append(nesterov_momentumSGD(vparams[2], params[2], dbetas, lr_bn, mu_bn))

        data_losses.append(data_loss)
        reg_losses.append(reg_loss)
        epoch += 1
        lr_w *= lr_decay  # decay only the weight learning rate

    # Visualise the data loss plus train/val accuracy.
    plt.close()
    fig = plt.figure('loss')
    ax = fig.add_subplot(111)  # fix: was used without being defined
    ax2 = ax.twinx()           # second y-axis for the accuracy curves
    ax.grid(True)
    ax2.grid(True)
    plt.xlabel('log10(lr_w)=' + str(round((np.log10(lr_w)), 2)) + ' ' + 'log10(reg)=' + str(round((np.log10(reg)), 2)), fontsize=14)
    plt.ylabel('                              accuracy       log10(data loss)', fontsize=14)
    ax.scatter(np.arange(len(data_losses)), np.log10(data_losses), c='b', marker='.')
    ax2.scatter(np.arange(len(val_accuracy)), val_accuracy, c='r', marker='*')
    ax2.scatter(np.arange(len(val_accuracy)), train_accuracy, c='g', marker='.')
    plt.show()

    # %% Log-scale update ratio of weights/betas per layer; a healthy
    # value is around 10**(-3).
    for layer in range(len(params[0])):
        wur = [weights_update_ratio[i][layer]
               for i in range(len(weights_update_ratio))]
        bur = [betas_update_ratio[i][layer]
               for i in range(len(betas_update_ratio))]

        plt.close()
        fig = plt.figure('update ratio')
        ax = fig.add_subplot(111)  # fix: was used without being defined
        ax2 = ax.twinx()
        ax.grid(True)
        ax2.grid(True)
        ax.scatter(np.arange(len(wur)), np.log10(wur), c='b', marker='.')
        ax2.scatter(np.arange(len(bur)), np.log10(bur), c='r', marker='*')
        plt.show()