Notes on 《卷积神经网络的Python实现》 (A Python Implementation of Convolutional Neural Networks), Part 3

Implementing a Neural Network in Code

We train a neural network on simulated, randomly generated data, pulling together the material covered so far: data preprocessing, the network model, gradient backpropagation, gradient checking, monitoring the training process, and random hyperparameter search.
1. Generating Data

Generate a set of toy data:

import numpy as np

num_samp_per_class = 200   # 200 samples per class
dim = 2                    # 2-dimensional features
N_class = 4                # 4 classes

def gen_toy_data(dim, N_class, num_samp_per_class):
    num_examples = num_samp_per_class * N_class
    X = np.zeros((num_examples, dim))
    labels = np.zeros(num_examples, dtype='uint8')
    for j in range(N_class):
        ix = range(num_samp_per_class*j, num_samp_per_class*(j+1))
        x = np.linspace(-np.pi, np.pi, num_samp_per_class) + 5
        y = np.sin(x + j*np.pi/(0.5*N_class))
        y += 0.2*np.sin(10*x + j*np.pi/(0.5*N_class))
        y += 0.25*x + 10                               # add a linear trend on top of the nonlinearity
        y += np.random.randn(num_samp_per_class)*0.1   # add noise
        X[ix] = np.c_[x, y]                            # stack x and y as the two feature columns
        labels[ix] = j                                 # class label
    return (X, labels)
Visualize the data:

import matplotlib.pyplot as plt

def show_data(X, labels):
    plt.scatter(X[:, 0], X[:, 1], c=labels, s=40, cmap=plt.cm.Spectral)
    plt.show()
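
A minimal usage sketch (not part of the original notes), assuming the globals defined above:

(X, labels) = gen_toy_data(dim, N_class, num_samp_per_class)
show_data(X, labels)   # four noisy sine-like curves, one color per class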
2. Data Preprocessing

Centering and normalization:

def normalize(X):                    # zero-center and normalize
    mean = np.mean(X, axis=0)        # per-feature mean
    X_norm = X - mean                # zero-center
    std = np.std(X_norm, axis=0)     # per-feature standard deviation
    X_norm /= std + 10**(-5)         # normalize; the small constant guards against division by zero
    return (X_norm, mean, std)
PCA and whitening:

def PCA_white(X):                    # PCA and whitening
    mean = np.mean(X, axis=0)
    X_norm = X - mean
    cov = np.dot(X_norm.T, X_norm)/X_norm.shape[0]   # covariance matrix
    U, S, V = np.linalg.svd(cov)                     # singular value decomposition
    X_norm = np.dot(X_norm, U)                       # decorrelate: project onto the eigenbasis
    X_norm /= np.sqrt(S + 10**(-5))   # whitening: geometrically, every dimension ends up with the same variance;
                                      # the small constant 10**(-5) guards against a zero variance
    return (X_norm, mean, U, S)
Randomly split the data:

def split_data(X, labels):     # split 2:1:1 into training, validation and test sets
    num_examples = X.shape[0]
    shuffle_no = list(range(num_examples))
    np.random.shuffle(shuffle_no)      # shuffle the indices, like shuffling a deck of cards
    X_train = X[shuffle_no[:num_examples//2]]
    labels_train = labels[shuffle_no[:num_examples//2]]
    X_val = X[shuffle_no[num_examples//2:num_examples//2 + num_examples//4]]
    labels_val = labels[shuffle_no[num_examples//2:num_examples//2 + num_examples//4]]
    X_test = X[shuffle_no[-num_examples//4:]]
    labels_test = labels[shuffle_no[-num_examples//4:]]
    return (X_train, labels_train, X_val, labels_val, X_test, labels_test)
Preprocess the data:

def data_preprocess(X_train, X_val, X_test):
    # PCA-whiten the validation and test sets with statistics computed on the training set
    (X_train_pca, mean, U, S) = PCA_white(X_train)
    X_val_pca = np.dot(X_val - mean, U)
    X_val_pca /= np.sqrt(S + 10**(-5))
    X_test_pca = np.dot(X_test - mean, U)
    X_test_pca /= np.sqrt(S + 10**(-5))
    return (X_train_pca, X_val_pca, X_test_pca)
3. The Network Model

Weight initialization:

def initialize_parameters(layer_param):     # initialize weights, biases and their momentum velocities
    weights = []
    biases = []
    vweights = []
    vbiases = []
    for i in range(len(layer_param) - 1):
        in_depth = layer_param[i]
        out_depth = layer_param[i+1]
        std = np.sqrt(2/in_depth)*0.5    # scale by 0.5 so the initial data loss is close to -log(1/N_class)
        weights.append(std*np.random.randn(in_depth, out_depth))
        biases.append(np.zeros((1, out_depth)))
        vweights.append(np.zeros((in_depth, out_depth)))
        vbiases.append(np.zeros((1, out_depth)))
    return (weights, biases, vweights, vbiases)
Forward pass:

def forward(X, layer_param, weights, biases):    # forward computation of the model
    hiddens = [X]                                # hiddens[0] is the input
    for i in range(len(layer_param) - 2):
        hiddens.append(np.maximum(0, np.dot(hiddens[i], weights[i]) + biases[i]))   # ReLU hidden layers
    scores = np.dot(hiddens[-1], weights[-1]) + biases[-1]                          # output layer: raw class scores
    return (hiddens, scores)
Softmax loss function:

def data_loss_softmax(scores, labels):    # softmax (cross-entropy) data loss
    num_examples = scores.shape[0]
    exp_scores = np.exp(scores)
    exp_scores_sum = np.sum(exp_scores, axis=1)
    correct_probs = exp_scores[range(num_examples), labels]/exp_scores_sum
    correct_logprobs = -np.log(correct_probs)
    data_loss = np.sum(correct_logprobs)/num_examples
    return data_loss
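
For reference, the quantity computed above is the average cross-entropy over the N examples in the batch:

$$\text{data\_loss} = \frac{1}{N}\sum_{i=1}^{N} -\log\frac{e^{s_{i,y_i}}}{\sum_{j} e^{s_{i,j}}}$$

where $s_{i,j}$ is the score of example $i$ for class $j$ and $y_i$ is its true label.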
L2 regularization loss:

def reg_L2_loss(weights, reg):      # L2-norm regularization loss
    reg_loss = 0
    for weight in weights:
        reg_loss += 0.5*reg*np.sum(weight*weight)
    return reg_loss
Gradient of the score matrix:

def dscores_softmax(scores, labels):    # gradient of the loss with respect to the score matrix
    num_examples = scores.shape[0]
    exp_scores = np.exp(scores)
    probs = exp_scores/np.sum(exp_scores, axis=1, keepdims=True)
    dscores = probs
    dscores[range(num_examples), labels] -= 1    # subtract 1 at the true class
    dscores /= num_examples
    return dscores
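
In closed form, what dscores_softmax returns is

$$\frac{\partial L}{\partial s_{i,j}} = \frac{1}{N}\left(p_{i,j} - \mathbb{1}[j = y_i]\right), \qquad p_{i,j} = \frac{e^{s_{i,j}}}{\sum_{k} e^{s_{i,k}}}$$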
Accuracy prediction:

def predict(X, labels, layer_param, weights, biases):
    # accuracy prediction; almost identical to forward(), except the hidden activations need not be stored
    hidden = X
    for i in range(len(layer_param) - 2):
        hidden = np.maximum(0, np.dot(hidden, weights[i]) + biases[i])
    scores = np.dot(hidden, weights[-1]) + biases[-1]
    predicted_class = np.argmax(scores, axis=1)
    right_class = predicted_class == labels
    return np.mean(right_class)
Gradient backpropagation:

def gradient_backprop(dscores, hiddens, weights, biases, reg):    # backpropagate the gradients
    dweights = []
    dbiases = []
    dhidden = dscores
    for i in range(len(hiddens)-1, -1, -1):       # walk backwards from the last layer
        dweights.append(np.dot(hiddens[i].T, dhidden) + reg*weights[i])   # gradients are appended in reverse layer order
        dbiases.append(np.sum(dhidden, axis=0, keepdims=True))
        dhidden = np.dot(dhidden, weights[i].T)
        dhidden[hiddens[i] <= 0] = 0              # ReLU gradient: zero wherever the activation was zero
    return (dweights, dbiases)
4. Gradient Checking

def gen_random_data(dim, N_class, num_samp_per_class):    # random data for the gradient check
    num_examples = num_samp_per_class*N_class
    X = np.random.randn(num_examples, dim)
    labels = np.random.randint(N_class, size=num_examples)
    return (X, labels)

def check_gradient(X, labels, layer_param, check_weight_or_bias):
    # (X, labels) = gen_random_data(dim, N_class, num_samp_per_class=200)
    # layer_param = [dim, N_class]
    # layer_param = [dim, 10, 20, N_class]
    # check_weight_or_bias: 1 for weight, 0 for bias
    (weights, biases, vweights, vbiases) = initialize_parameters(layer_param)
    reg = 10**(-9)
    step = 10**(-5)
    for layer in range(len(weights)):
        # pick one random weight (or bias) entry in this layer
        if check_weight_or_bias:
            row = np.random.randint(weights[layer].shape[0])
            col = np.random.randint(weights[layer].shape[1])
            param = weights[layer][row][col]
        else:
            row = np.random.randint(biases[layer].shape[1])
            param = biases[layer][0][row]
        # analytic gradient from backpropagation
        (hiddens, scores) = forward(X, layer_param, weights, biases)
        dscores = dscores_softmax(scores, labels)
        (dweights, dbiases) = gradient_backprop(dscores, hiddens, weights, biases, reg)
        if check_weight_or_bias:
            danalytic = dweights[-1-layer][row][col]   # gradients are stored in reverse layer order
        else:
            danalytic = dbiases[-1-layer][0][row]
        # numerical gradient: loss at (param - step)
        if check_weight_or_bias:
            weights[layer][row][col] = param - step
        else:
            biases[layer][0][row] = param - step
        (hiddens, scores) = forward(X, layer_param, weights, biases)
        data_loss1 = data_loss_softmax(scores, labels)
        reg_loss1 = reg_L2_loss(weights, reg)
        loss1 = data_loss1 + reg_loss1
        # numerical gradient: loss at (param + step)
        if check_weight_or_bias:
            weights[layer][row][col] = param + step
        else:
            biases[layer][0][row] = param + step
        (hiddens, scores) = forward(X, layer_param, weights, biases)   # fixed typo: these recomputed scores must be used below
        data_loss2 = data_loss_softmax(scores, labels)
        reg_loss2 = reg_L2_loss(weights, reg)
        loss2 = data_loss2 + reg_loss2
        dnumeric = (loss2 - loss1)/(2*step)           # centered difference
        print(layer, data_loss1, data_loss2)
        error_relative = np.abs(danalytic - dnumeric)/np.maximum(np.abs(danalytic), np.abs(dnumeric))
        print(danalytic, dnumeric, error_relative)
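
The check compares the analytic gradient from backpropagation against a centered finite difference of the loss and reports the relative error:

$$g_{\text{numeric}} = \frac{L(\theta + h) - L(\theta - h)}{2h}, \qquad \text{error} = \frac{|g_{\text{analytic}} - g_{\text{numeric}}|}{\max(|g_{\text{analytic}}|, |g_{\text{numeric}}|)}$$

with step size $h = 10^{-5}$ here; a small relative error indicates the backpropagation code is correct.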
5. Parameter Optimization

def nesterov_momentumSGD(vparams, params, dparams, lr, mu):    # Nesterov momentum parameter update
    update_ratio = []
    for i in range(len(params)):
        pre_vparam = vparams[i]
        vparams[i] = mu*vparams[i] - lr*dparams[-1-i]          # dparams is stored in reverse layer order
        update_param = vparams[i] + mu*(vparams[i] - pre_vparam)
        params[i] += update_param
        update_ratio.append(np.sum(np.abs(update_param))/np.sum(np.abs(params[i])))   # update ratio, used to monitor training
    return update_ratio
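
The code above is the usual rewritten form of the Nesterov momentum update, with velocity $v$, momentum coefficient $\mu$, learning rate $\eta$ and gradient $g_t$:

$$v_{t+1} = \mu v_t - \eta\, g_t, \qquad \theta_{t+1} = \theta_t + v_{t+1} + \mu\,(v_{t+1} - v_t)$$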
6. Training the Network

def train_net(X_train, labels_train, layer_param, lr, lr_decay, reg, mu, max_epoch, X_val, labels_val):
    (weights, biases, vweights, vbiases) = initialize_parameters(layer_param)   # 1. initialize the parameters
    epoch = 0
    data_losses = []
    reg_losses = []
    val_accuracy = []
    train_accuracy = []
    weights_update_ratio = []
    biases_update_ratio = []
    while epoch < max_epoch:
        (hiddens, scores) = forward(X_train, layer_param, weights, biases)   # 2. forward pass: hidden activations and score matrix
        val_accuracy.append(predict(X_val, labels_val, layer_param, weights, biases))   # 3. accuracy on the validation and training sets
        train_accuracy.append(predict(X_train, labels_train, layer_param, weights, biases))
        data_loss = data_loss_softmax(scores, labels_train)   # 4. data loss and regularization loss
        reg_loss = reg_L2_loss(weights, reg)
        dscores = dscores_softmax(scores, labels_train)       # 5. backpropagation starts from the gradient of the score matrix
        (dweights, dbiases) = gradient_backprop(dscores, hiddens, weights, biases, reg)   # 6. then continues back through the layers
        weights_update_ratio.append(nesterov_momentumSGD(vweights, weights, dweights, lr, mu))   # 7. update the parameters and record the update ratios
        biases_update_ratio.append(nesterov_momentumSGD(vbiases, biases, dbiases, lr, mu))
        data_losses.append(data_loss)
        reg_losses.append(reg_loss)
        epoch += 1
        lr *= lr_decay   # 8. exponential learning-rate decay (annealing)
    # visualize the data loss and the training/validation accuracy
    plt.close()
    fig = plt.figure('loss')
    ax = fig.add_subplot(2, 1, 1)
    ax.grid(True)
    ax2 = fig.add_subplot(2, 1, 2)
    ax2.grid(True)
    plt.xlabel('log10(lr)=' + str(round(np.log10(lr), 2)) + ' ' + 'log10(reg)=' + str(round(np.log10(reg), 2)),
               fontsize=14)
    plt.ylabel('accuracy  log10(data loss)', fontsize=14)
    ax.scatter(np.arange(len(data_losses)), np.log10(data_losses), c='b', marker='.')
    # ax2.scatter(np.arange(len(reg_losses)), np.log10(reg_losses), c='r', marker='*')
    ax2.scatter(np.arange(len(val_accuracy)), val_accuracy, c='r', marker='*')
    ax2.scatter(np.arange(len(train_accuracy)), train_accuracy, c='g', marker='.')
    # ax2.scatter(np.arange(len(val_accuracy)), np.log10(1 - np.array(val_accuracy)), c='r', marker='*')
    # ax2.scatter(np.arange(len(val_accuracy)), np.log10(1 - np.array(train_accuracy)), c='g', marker='.')
    plt.show()
    # plot, on a log scale, the per-layer update ratios of the weights and biases; a reasonable value is around 10**(-3)
    for layer in range(len(weights)):
        wur = []
        for i in range(len(weights_update_ratio)):
            wur.append(weights_update_ratio[i][layer])
        bur = []
        for i in range(len(biases_update_ratio)):
            bur.append(biases_update_ratio[i][layer])
        plt.close()
        fig = plt.figure('update ratio')
        ax = fig.add_subplot(2, 1, 1)
        ax.grid(True)
        ax2 = fig.add_subplot(2, 1, 2)
        ax2.grid(True)
        plt.xlabel('log10(lr)=' + str(round(np.log10(lr), 2)) + ' ' + 'log10(reg)=' + str(round(np.log10(reg), 2)),
                   fontsize=14)
        ax.scatter(np.arange(len(wur)), np.log10(wur), c='b', marker='.')
        ax2.scatter(np.arange(len(bur)), np.log10(bur), c='r', marker='*')
        plt.show()
    return (data_losses, reg_losses, weights, biases, val_accuracy)
7. Overfitting a Tiny Dataset

def overfit_tinydata(X, labels, layer_param, lr=10**(-0.0), lr_decay=1, mu=0.9, reg=0, max_epoch=100):
    # sanity check: a healthy network should be able to overfit a tiny dataset, so reg=0
    # (X, labels) = gen_toy_data(dim, N_class, num_samp_per_class=2)
    # X, _, _, _ = PCA_white(X)
    # layer_param = [dim, 100, 100, N_class]
    (data_losses, reg_losses, weights, biases, accuracy) = train_net(X, labels, layer_param, lr, lr_decay, reg, mu,
                                                                     max_epoch, X, labels)
    return (data_losses, reg_losses, accuracy)
# data_loss = 4.223167361579445e-05
8. Random Hyperparameter Search

# random hyperparameter search; it converges quickly, so no annealing is needed (lr_decay=1)
def hyperparam_random_search(X_train, labels_train, X_val, labels_val, layer_param, num_try=10, lr=[-1, -5],
                             lr_decay=0.997, mu=0.9, reg=[-2.0, -5.0], max_epoch=500):
    # (X, labels) = gen_toy_data(dim, N_class, num_samp_per_class=200)
    # layer_param = [dim, 100, 100, N_class]
    minlr = min(lr)
    maxlr = max(lr)
    randn = np.random.rand(num_try * 2)
    lr_array = 10 ** (minlr + (maxlr - minlr) * randn[0:num_try])              # sample learning rates on a log scale
    minreg = min(reg)
    maxreg = max(reg)
    reg_array = 10 ** (minreg + (maxreg - minreg) * randn[num_try:2*num_try])  # sample regularization strengths on a log scale
    lr_regs = zip(lr_array, reg_array)
    for lr_reg in lr_regs:
        (data_loss, reg_loss, weights, biases, val_accuracy) = train_net(X_train, labels_train, layer_param, lr_reg[0],
                                                                         lr_decay, lr_reg[1], mu, max_epoch, X_val,
                                                                         labels_val)
    return (weights, biases)
9. Program Organization

if __name__ == '__main__':
    # %%
    dim = 2      # dimensionality
    N_class = 4  # number of classes
    # %% gradient check
    layer_param = [dim, 10, 20, N_class]
    (X, labels) = gen_random_data(dim, N_class, num_samp_per_class=20)
    for i in range(2):
        check_gradient(X, labels, layer_param, 1)
    # %% overfit a tiny dataset
    layer_param = [dim, 100, 100, N_class]
    (X, labels) = gen_toy_data(dim, N_class, num_samp_per_class=2)
    X, _, _, _ = PCA_white(X)
    (data_losses, reg_losses, accuracy) = overfit_tinydata(X, labels, layer_param, lr=10**(-0.5), lr_decay=1, mu=0.9,
                                                           reg=10**(-10), max_epoch=100)
    # %% random hyperparameter search on the full toy dataset
    layer_param = [dim, 100, 100, N_class]
    (X, labels) = gen_toy_data(dim, N_class, num_samp_per_class=200)
    (X_train, labels_train, X_val, labels_val, X_test, labels_test) = split_data(X, labels)
    (X_train_pca, X_val_pca, X_test_pca) = data_preprocess(X_train, X_val, X_test)
    (weights, biases) = hyperparam_random_search(X_train_pca, labels_train, X_val_pca, labels_val, layer_param,
                                                 num_try=2, lr=[-1, -2.1], lr_decay=1, mu=0.9, reg=[-2, -5],
                                                 max_epoch=10000)
The program has three main parts: data-preprocessing functions, network-model functions, and the code that uses the model.
These parts can be split into three .py files: parts 1 and 2 go into data_processes.py, part 9 goes into train.py, and the remaining parts go into nn.py; train.py then imports the data_processes and nn modules, as sketched below.
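
A minimal sketch of what train.py could look like after the split (the module layout is the one described above; the particular calls are only an example):

# train.py: an assumed layout for illustration
import data_processes as dp
import nn

if __name__ == '__main__':
    dim, N_class = 2, 4
    layer_param = [dim, 100, 100, N_class]
    (X, labels) = dp.gen_toy_data(dim, N_class, num_samp_per_class=200)
    (X_train, labels_train, X_val, labels_val,
     X_test, labels_test) = dp.split_data(X, labels)
    (X_train_pca, X_val_pca, X_test_pca) = dp.data_preprocess(X_train, X_val, X_test)
    (weights, biases) = nn.hyperparam_random_search(X_train_pca, labels_train, X_val_pca, labels_val,
                                                    layer_param, num_try=2, max_epoch=1000)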
This completes the implementation of a plain neural network without batch-normalization (BN) layers.
10. Disclaimer

To be clear, this blog is a set of study notes on 《卷积神经网络的Python实现》 (单建华, 人民邮电出版社 / Posts & Telecom Press). The code is not my own: I only fixed some bugs in the original source code, added comments to make it easier to follow, and am sharing my own understanding along the way. The original resources can be downloaded from https://www.ituring.com.cn/book/2661. Yesterday I saw some words of praise under my blog; flattered as I was, I feel somewhat undeserving of them.
Furthermore, 单建华's 《卷积神经网络的Python实现》 really is a book that makes it easy for newcomers to understand neural networks and convolutional neural networks; I recommend borrowing it from the library or buying a copy.