This project contains a Python implementation of ex4 from Andrew Ng's Machine Learning course; the main topic is the backpropagation neural network.
1. Neural Network
1.1 Data Visualization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
from scipy.io import loadmat
from sklearn.preprocessing import OneHotEncoder
data=loadmat(r'C:\Users\xxx\Desktop\机器学习\machine-learning-ex4\machine-learning-ex4\ex4\ex4data1.mat')
data
{'__header__': b'MATLAB 5.0 MAT-file, Platform: GLNXA64, Created on: Sun Oct 16 13:09:09 2011',
'__version__': '1.0',
'__globals__': [],
'X': array([[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
...,
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.]]),
'y': array([[10],
[10],
[10],
...,
[ 9],
[ 9],
[ 9]], dtype=uint8)}
X=data['X']
y=data['y']
X.shape,y.shape
((5000, 400), (5000, 1))
weight=loadmat(r'C:\Users\xxx\Desktop\机器学习\machine-learning-ex4\machine-learning-ex4\ex4\ex4weights.mat')
theta1,theta2=weight['Theta1'],weight['Theta2']
theta1.shape,theta2.shape
((25, 401), (10, 26))
sample_idx = np.random.choice(np.arange(data['X'].shape[0]), 100)
sample_images = data['X'][sample_idx, :]
fig, ax_array = plt.subplots(nrows=10, ncols=10, sharey=True, sharex=True, figsize=(12, 12))
for r in range(10):
    for c in range(10):
        ax_array[r, c].matshow(np.array(sample_images[10 * r + c].reshape((20, 20))).T, cmap=matplotlib.cm.binary)
        plt.xticks(np.array([]))
        plt.yticks(np.array([]))
1.2 Defining the Forward-Propagation and Cost Functions
def sigmoid(z):
    return 1 / (1 + np.exp(-z))
Neural network structure
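From the shapes above, the network is the same as in the exercise: 400 input units (the 20×20 pixel images), 25 hidden units, and 10 output units. As a sketch of what forward_propagate below computes (samples are rows of X, a bias column of ones is prepended at each layer, and g denotes the sigmoid):
$$a^{(1)} = [\,1,\; X\,],\quad z^{(2)} = a^{(1)}\Theta_1^{T},\quad a^{(2)} = [\,1,\; g(z^{(2)})\,],\quad z^{(3)} = a^{(2)}\Theta_2^{T},\quad h = g(z^{(3)})$$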
def forward_propagate(X, theta1, theta2):
    m = X.shape[0]
    a1 = np.insert(X, 0, values=np.ones(m), axis=1)
    z2 = a1 * theta1.T
    a2 = np.insert(sigmoid(z2), 0, values=np.ones(m), axis=1)
    z3 = a2 * theta2.T
    h = sigmoid(z3)
    return a1, z2, a2, z3, h
This is the cost function without regularization.
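For reference, the loop below accumulates the cross-entropy cost over all m samples and K = 10 output units:
$$J(\theta) = \frac{1}{m}\sum_{i=1}^{m}\sum_{k=1}^{K}\Big[-y_k^{(i)}\log\big(h_\theta(x^{(i)})\big)_k-\big(1-y_k^{(i)}\big)\log\big(1-(h_\theta(x^{(i)}))_k\big)\Big]$$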
def cost(theta1, theta2, input_size, hidden_size, num_labels, X, y, learning_rate):
    m = X.shape[0]
    X = np.matrix(X)
    y = np.matrix(y)
    a1, z2, a2, z3, h = forward_propagate(X, theta1, theta2)
    J = 0
    for i in range(m):
        first_term = np.multiply(-y[i,:], np.log(h[i,:]))
        second_term = np.multiply((1 - y[i,:]), np.log(1 - h[i,:]))
        J += np.sum(first_term - second_term)
    J = J / m
    return J
1.3 Data Preprocessing and Parameter Initialization
encoder = OneHotEncoder(sparse=False)  # sparse=False returns a dense array; True returns a sparse matrix
y_onehot=encoder.fit_transform(y)
y_onehot.shape
(5000, 10)
y[0],y_onehot[0,:]
(array([10], dtype=uint8), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]))
input_size=400
hidden_size=25
num_labels=10
learning_rate=1
Compute the cost
cost(theta1,theta2,input_size,hidden_size,num_labels,X,y_onehot,learning_rate)
0.2876291651613188
1.4 Defining the Regularized Cost Function and Backpropagation
Regularized cost function
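Compared with the unregularized version, costReg adds a penalty on all weights except the bias columns; note that the argument named learning_rate actually plays the role of the regularization parameter λ here:
$$J_{reg}(\theta) = J(\theta) + \frac{\lambda}{2m}\Big[\sum_{j,k\ge 1}\big(\Theta^{(1)}_{j,k}\big)^2 + \sum_{j,k\ge 1}\big(\Theta^{(2)}_{j,k}\big)^2\Big]\qquad(\text{bias columns excluded})$$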
def costReg(theta1, theta2, input_size, hidden_size, num_labels, X, y, learning_rate):
    m = X.shape[0]
    X = np.matrix(X)
    y = np.matrix(y)
    # run the feed-forward pass
    a1, z2, a2, z3, h = forward_propagate(X, theta1, theta2)
    # compute the cost
    J = 0
    for i in range(m):
        first_term = np.multiply(-y[i,:], np.log(h[i,:]))
        second_term = np.multiply((1 - y[i,:]), np.log(1 - h[i,:]))
        J += np.sum(first_term - second_term)
    J = J / m
    # add the cost regularization term
    J += (float(learning_rate) / (2 * m)) * (np.sum(np.power(theta1[:,1:], 2)) + np.sum(np.power(theta2[:,1:], 2)))
    return J
costReg(theta1, theta2, input_size, hidden_size, num_labels, X, y_onehot, learning_rate)
0.38376985909092354
def sigmoid_gradient(z):
    return np.multiply(sigmoid(z), (1 - sigmoid(z)))
sigmoid_gradient(0)
0.25
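The value 0.25 is exactly what the derivative of the sigmoid predicts at z = 0:
$$g'(z) = \frac{d}{dz}\,\frac{1}{1+e^{-z}} = g(z)\big(1-g(z)\big),\qquad g'(0)=0.5\times 0.5 = 0.25$$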
Random initialization
params=(np.random.random(size=hidden_size*(input_size+1)+num_labels*(hidden_size+1))-0.5)*0.24
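Multiplying by 0.24 draws each weight uniformly from roughly (−0.12, 0.12), i.e. ε_init ≈ 0.12 as suggested in the exercise. A minimal per-layer sketch of the more general heuristic ε_init = √6 / √(L_in + L_out) (the helper name random_init is my own, not part of the original code):

def random_init(l_in, l_out):
    # heuristic: epsilon_init = sqrt(6) / sqrt(L_in + L_out); for 400 -> 25 this gives ~0.12
    epsilon_init = np.sqrt(6) / np.sqrt(l_in + l_out)
    # weight matrix mapping l_in inputs (+1 bias) to l_out units, entries in (-epsilon_init, epsilon_init)
    return (np.random.random((l_out, l_in + 1)) * 2 - 1) * epsilon_init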
Backpropagation
def backprop(params, input_size, hidden_size, num_labels, X, y, learning_rate):
    m = X.shape[0]
    X = np.matrix(X)
    y = np.matrix(y)
    # reshape the parameter vector into the weight matrices first,
    # then run the feed-forward pass with them
    theta1 = np.matrix(np.reshape(params[:hidden_size * (input_size + 1)], (hidden_size, (input_size + 1))))
    theta2 = np.matrix(np.reshape(params[hidden_size * (input_size + 1):], (num_labels, (hidden_size + 1))))
    a1, z2, a2, z3, h = forward_propagate(X, theta1, theta2)
    J = 0
    delta1 = np.zeros(theta1.shape)
    delta2 = np.zeros(theta2.shape)
    for i in range(m):
        first_term = np.multiply(-y[i,:], np.log(h[i,:]))
        second_term = np.multiply((1 - y[i,:]), np.log(1 - h[i,:]))
        J += np.sum(first_term - second_term)
    J = J / m
    for t in range(m):
        a1t = a1[t,:]
        z2t = z2[t,:]
        a2t = a2[t,:]
        ht = h[t,:]
        yt = y[t,:]
        d3t = ht - yt
        z2t = np.insert(z2t, 0, values=np.ones(1))
        d2t = np.multiply((theta2.T * d3t.T).T, sigmoid_gradient(z2t))
        delta1 = delta1 + (d2t[:,1:]).T * a1t
        delta2 = delta2 + d3t.T * a2t
    delta1 = delta1 / m
    delta2 = delta2 / m
    return J, delta1, delta2
Backpropagation with regularization
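For each sample t, the loop below computes the output-layer error, propagates it back through the hidden layer, and accumulates the gradients; in the exercise's notation (λ again comes from learning_rate, the bias entry of δ^(2) is dropped, and the bias column is not regularized):
$$\delta^{(3)} = h - y,\qquad \delta^{(2)} = \big(\Theta_2\big)^{T}\delta^{(3)} \circ g'(z^{(2)}),\qquad \Delta^{(l)} \mathrel{+}= \delta^{(l+1)}\big(a^{(l)}\big)^{T}$$
$$\frac{\partial J}{\partial \Theta^{(l)}} = \frac{1}{m}\,\Delta^{(l)} + \frac{\lambda}{m}\,\Theta^{(l)}\qquad(\text{regularization not applied to the bias column})$$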
def backpropReg(params, input_size, hidden_size, num_labels, X, y, learning_rate):
    m = X.shape[0]
    X = np.matrix(X)
    y = np.matrix(y)
    # reshape the parameter array into parameter matrices for each layer
    theta1 = np.matrix(np.reshape(params[:hidden_size * (input_size + 1)], (hidden_size, (input_size + 1))))
    theta2 = np.matrix(np.reshape(params[hidden_size * (input_size + 1):], (num_labels, (hidden_size + 1))))
    # run the feed-forward pass
    a1, z2, a2, z3, h = forward_propagate(X, theta1, theta2)
    # initializations
    J = 0
    delta1 = np.zeros(theta1.shape)  # (25, 401)
    delta2 = np.zeros(theta2.shape)  # (10, 26)
    # compute the cost
    for i in range(m):
        first_term = np.multiply(-y[i,:], np.log(h[i,:]))
        second_term = np.multiply((1 - y[i,:]), np.log(1 - h[i,:]))
        J += np.sum(first_term - second_term)
    J = J / m
    # add the cost regularization term
    J += (float(learning_rate) / (2 * m)) * (np.sum(np.power(theta1[:,1:], 2)) + np.sum(np.power(theta2[:,1:], 2)))
    # perform backpropagation
    for t in range(m):
        a1t = a1[t,:]  # (1, 401)
        z2t = z2[t,:]  # (1, 25)
        a2t = a2[t,:]  # (1, 26)
        ht = h[t,:]    # (1, 10)
        yt = y[t,:]    # (1, 10)
        d3t = ht - yt  # (1, 10)
        z2t = np.insert(z2t, 0, values=np.ones(1))  # (1, 26)
        d2t = np.multiply((theta2.T * d3t.T).T, sigmoid_gradient(z2t))  # (1, 26)
        delta1 = delta1 + (d2t[:,1:]).T * a1t
        delta2 = delta2 + d3t.T * a2t
    delta1 = delta1 / m
    delta2 = delta2 / m
    # add the gradient regularization term (bias columns excluded)
    delta1[:,1:] = delta1[:,1:] + (theta1[:,1:] * learning_rate) / m
    delta2[:,1:] = delta2[:,1:] + (theta2[:,1:] * learning_rate) / m
    # concatenate the two gradient arrays into a single vector
    grad = np.concatenate((np.ravel(delta1), np.ravel(delta2)))
    return J, grad
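As a quick sanity check before optimizing, the function can be evaluated once to confirm it returns a finite cost and one gradient entry per parameter (a small sketch using the variables defined above):

J, grad = backpropReg(params, input_size, hidden_size, num_labels, X, y_onehot, learning_rate)
J, grad.shape  # grad.shape should equal params.shape, i.e. (25*401 + 10*26,) = (10285,)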
1.5 Finding the Optimal Parameters with an Optimization Library
from scipy.optimize import minimize
fmin = minimize(fun=backpropReg, x0=(params), args=(input_size, hidden_size, num_labels, X, y_onehot, learning_rate), method='TNC', jac=True, options={'maxiter': 250})
# maxiter: maximum number of iterations
fmin
fun: 0.32940356143623817
jac: array([ 1.72209086e-04, 7.10131568e-08, -7.66715574e-08, ...,
3.31147356e-06, -2.45840176e-05, -2.04963093e-04])
message: 'Max. number of function evaluations reached'
nfev: 250
nit: 19
status: 3
success: False
x: array([-7.62726613e-01, 3.55065784e-04, -3.83357787e-04, ...,
-9.58437980e-02, -3.56685537e+00, -5.83283702e-02])
X = np.matrix(X)
thetafinal1 = np.matrix(np.reshape(fmin.x[:hidden_size * (input_size + 1)], (hidden_size, (input_size + 1))))
thetafinal2 = np.matrix(np.reshape(fmin.x[hidden_size * (input_size + 1):], (num_labels, (hidden_size + 1))))
a1, z2, a2, z3, h = forward_propagate(X, thetafinal1, thetafinal2 )
y_pred = np.array(np.argmax(h, axis=1) + 1)
y_pred
array([[10],
[10],
[10],
...,
[ 9],
[ 9],
[ 9]], dtype=int64)
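Before the full report, overall accuracy can be checked directly from the predictions (a simple sketch; the value should be close to the 0.99 reported below):

accuracy = np.mean(y_pred == y)
accuracy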
Evaluation report
from sklearn.metrics import classification_report  # classification_report produces the evaluation report
print(classification_report(y, y_pred))
precision recall f1-score support
1 0.99 0.99 0.99 500
2 1.00 1.00 1.00 500
3 0.99 0.99 0.99 500
4 1.00 0.99 1.00 500
5 1.00 1.00 1.00 500
6 1.00 0.99 1.00 500
7 0.99 1.00 0.99 500
8 0.99 1.00 1.00 500
9 0.99 0.98 0.99 500
10 0.99 1.00 1.00 500
accuracy 0.99 5000
macro avg 0.99 0.99 0.99 5000
weighted avg 0.99 0.99 0.99 5000
1.6 Visualizing the Hidden Layer
hidden_layer=thetafinal1[:,1:]
hidden_layer.shape
(25, 400)
fig, ax_array = plt.subplots(nrows=5, ncols=5, sharey=True, sharex=True, figsize=(12, 12))
for r in range(5):
    for c in range(5):
        ax_array[r, c].matshow(np.array(hidden_layer[5 * r + c].reshape((20, 20))), cmap=matplotlib.cm.binary)
        plt.xticks(np.array([]))
        plt.yticks(np.array([]))
2 Summary
- Meaning of the possible values of the jac argument of minimize (from the SciPy documentation):
jac : {callable, '2-point', '3-point', 'cs', bool}, optional
Method for computing the gradient vector. Only relevant for CG, BFGS, Newton-CG, L-BFGS-B, TNC, SLSQP, dogleg, trust-ncg, trust-krylov, trust-exact and trust-constr.
If it is callable, it should be a function that returns the gradient vector: jac(x, *args) -> array_like, shape (n,), where x is an array of shape (n,) and args is the tuple of fixed parameters.
If jac is a Boolean and is True, fun is assumed to return the objective and the gradient as an (f, g) tuple; this is why backpropReg above returns J and grad together and is passed with jac=True. The 'Newton-CG', 'trust-ncg', 'dogleg', 'trust-exact' and 'trust-krylov' methods require either a callable or that fun return the objective and gradient.
If None or False, the gradient is estimated with two-point finite differences using an absolute step size. Alternatively, the keywords {'2-point', '3-point', 'cs'} select a finite-difference scheme for a numerical estimate of the gradient with a relative step size. These finite-difference schemes obey any specified bounds.
- Gradient checking can be used to verify that the backpropagation implementation is correct (see the sketch after this list).
- OneHotEncoder(sparse=False): sparse=False returns a dense array, True returns a sparse matrix (in recent scikit-learn versions the argument is named sparse_output).
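A minimal gradient-checking sketch (the function name gradient_check, ε = 1e-4, and the number of spot checks are my own choices; it reuses the global variables defined above and is slow, since backpropReg loops over all 5000 samples):

def gradient_check(params, epsilon=1e-4, num_checks=10):
    # analytic gradient from backpropagation
    _, grad = backpropReg(params, input_size, hidden_size, num_labels, X, y_onehot, learning_rate)
    for i in np.random.choice(params.size, num_checks, replace=False):
        plus, minus = params.copy(), params.copy()
        plus[i] += epsilon
        minus[i] -= epsilon
        # two-sided numerical estimate: (J(theta + e) - J(theta - e)) / (2e)
        J_plus, _ = backpropReg(plus, input_size, hidden_size, num_labels, X, y_onehot, learning_rate)
        J_minus, _ = backpropReg(minus, input_size, hidden_size, num_labels, X, y_onehot, learning_rate)
        numeric = (J_plus - J_minus) / (2 * epsilon)
        print(i, numeric, grad[i])  # the two values should agree to roughly 1e-4 or better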