Introduction
This post is a Python implementation of Exercise 4 from Andrew Ng's Machine Learning course. Exercise 4 asks us to implement a neural network from scratch to recognize images of handwritten digits, which is essentially a multi-class classification problem.
Code
This part is the same as in ex3: a dataset of 5000 handwritten digit images, 20×20 pixels each, together with their labels (digits 1-9, with 0 stored as 10).
# import libs
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import loadmat
import matplotlib
from sklearn.preprocessing import OneHotEncoder
# load and plot data
data = loadmat('../data_sets/ex4data1.mat')
sample_idx = np.random.choice(np.arange(data['X'].shape[0]), 100)
sample_images = data['X'][sample_idx, :]
fig, ax_array = plt.subplots(nrows=10, ncols=10, sharey=True, sharex=True, figsize=(12, 12))
for r in range(10):
    for c in range(10):
        ax_array[r, c].matshow(np.array(sample_images[10 * r + c].reshape((20, 20))).T, cmap=matplotlib.cm.binary)
        plt.xticks(np.array([]))
        plt.yticks(np.array([]))
# activation function; we use sigmoid here, but feel free to try others such as tanh or relu
def sigmoid(x):
    return 1 / (1 + np.exp(-x))
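For reference, minimal NumPy sketches of the alternatives mentioned in the comment (they are not used anywhere else in this post) could look like:

# sketches of alternative activations; not used elsewhere in this post
def tanh(x):
    return np.tanh(x)

def relu(x):
    return np.maximum(0, x)

Swapping one of these in would also require replacing the sigmoid derivative used in the backward pass below with the matching derivative.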
# define the NN forward pass; the network has one hidden layer (two weight matrices)
def forward(theta1, theta2, X):
    theta1 = np.matrix(theta1)
    theta2 = np.matrix(theta2)
    X = np.insert(X, 0, np.ones(X.shape[0]), axis=1)  # prepend bias column
    X = np.matrix(X)
    A0 = X
    Z1 = X * theta1.T
    A1 = sigmoid(Z1)
    A1 = np.insert(A1, 0, np.ones(A1.shape[0]), axis=1)  # prepend bias column
    A1 = np.matrix(A1)
    Z2 = A1 * theta2.T
    A2 = sigmoid(Z2)
    return A0, Z1, A1, Z2, A2
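In matrix form, with a bias column of ones prepended at each layer, the forward pass computes:

$$A_0 = [\mathbf{1},\,X],\quad Z_1 = A_0\,\Theta_1^{T},\quad A_1 = [\mathbf{1},\,\sigma(Z_1)],\quad Z_2 = A_1\,\Theta_2^{T},\quad \hat{y} = A_2 = \sigma(Z_2)$$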
# define the cost function that we will minimize
def cost(theta1, theta2, X, y, lambd):
    A0, Z1, A1, Z2, y_hat = forward(theta1, theta2, X)
    first = np.multiply(-y, np.log(y_hat))
    second = np.multiply(1 - y, np.log(1 - y_hat))
    J = (first - second)
    J = J / len(X)
    J = np.sum(J)
    # the regularization term skips the bias column of each theta
    reg = float(lambd) * (np.sum(np.power(theta1[:, 1:], 2)) + np.sum(np.power(theta2[:, 1:], 2))) / (2 * len(X))
    J += reg
    return J
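This implements the regularized cross-entropy cost from the exercise, with m = 5000 samples and K = 10 classes, where the regularization sums run over all non-bias entries of the two weight matrices:

$$J(\Theta) = \frac{1}{m}\sum_{i=1}^{m}\sum_{k=1}^{K}\Big[-y_k^{(i)}\log \hat{y}_k^{(i)} - \big(1-y_k^{(i)}\big)\log\big(1-\hat{y}_k^{(i)}\big)\Big] + \frac{\lambda}{2m}\Big(\sum (\Theta_{1,[:,1:]})^2 + \sum (\Theta_{2,[:,1:]})^2\Big)$$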
data['X'].shape,data['y'].shape
# ((5000, 400), (5000, 1))
encoder = OneHotEncoder(sparse=False)  # for scikit-learn >= 1.2, use sparse_output=False instead
y_onehot = encoder.fit_transform(data['y'])
y_onehot.shape
# (5000, 10)
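Note that the cost call below references theta1 and theta2, which are never defined in this post. In the original exercise they are the pre-trained weights provided in ex4weights.mat (keys 'Theta1' and 'Theta2'). A minimal loading sketch, assuming the file sits in the same directory as ex4data1.mat:

# load the pre-trained weights shipped with ex4 (the path is an assumption)
weight = loadmat('../data_sets/ex4weights.mat')
theta1, theta2 = weight['Theta1'], weight['Theta2']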
cost(theta1,theta2,data['X'],y_onehot,1)
# 0.3837698590909236
# derivative of the sigmoid function
def d_sigmoid(x):
    return np.multiply(sigmoid(x), (1 - sigmoid(x)))
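This uses the standard identity: since $\sigma(x) = 1/(1+e^{-x})$,

$$\sigma'(x) = \frac{e^{-x}}{(1+e^{-x})^2} = \sigma(x)\,\big(1-\sigma(x)\big)$$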
# backward pass: unroll params into the two thetas, then compute the cost and gradients
def back(params, hidden, label, input, X, y, lambd):
    theta1 = np.matrix(np.reshape(params[:hidden * (input + 1)], (hidden, (input + 1))))
    theta2 = np.matrix(np.reshape(params[hidden * (input + 1):], (label, (hidden + 1))))
    A0, Z1, A1, Z2, y_pred = forward(theta1, theta2, X)
    J = cost(theta1, theta2, X, y, lambd)
    DZ2 = y_pred - y
    Z1 = np.insert(Z1, 0, values=np.ones(len(X)), axis=1)  # prepend bias so shapes match theta2
    DZ1 = np.multiply(DZ2 * theta2, d_sigmoid(Z1))
    DT2 = (DZ2.T * A1) / len(X)
    DT1 = (DZ1[:, 1:].T * A0) / len(X)
    # regularize all but the bias columns
    DT2[:, 1:] = DT2[:, 1:] + lambd * theta2[:, 1:] / len(X)
    DT1[:, 1:] = DT1[:, 1:] + lambd * theta1[:, 1:] / len(X)
    grad = np.concatenate((np.ravel(DT1), np.ravel(DT2)))
    return J, grad
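These lines implement the usual backpropagation equations for a sigmoid output with cross-entropy loss. With $\delta$ denoting the error terms and the bias column dropped from $\delta_1$ before computing the gradient:

$$\delta_2 = \hat{y} - y, \qquad \delta_1 = \big((\delta_2\,\Theta_2) \odot \sigma'([\mathbf{1},\,Z_1])\big)_{[:,\,1:]}$$

$$\frac{\partial J}{\partial \Theta_2} = \frac{1}{m}\,\delta_2^{T} A_1, \qquad \frac{\partial J}{\partial \Theta_1} = \frac{1}{m}\,\delta_1^{T} A_0$$

plus the $\frac{\lambda}{m}\Theta$ regularization term on the non-bias columns.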
# get params ready
hidden = 25
input = 400
label = 10
# initialize thetas randomly, uniform in (-0.12, 0.12)
params = (np.random.random(size=hidden * (input + 1) + label * (hidden + 1)) - 0.5) * 0.24
# the last argument is the regularization parameter lambda = 1
J, grad = back(params, hidden, label, input, data['X'], y_onehot, 1)
# it is good practice to gradient-check, to make sure our implementation is correct
def get_approx_grad(params, change, hidden, label, input, X, y, lambd):
    grad = np.zeros(len(params))
    for i in range(len(params)):
        params[i] += change
        theta1 = np.matrix(np.reshape(params[:hidden * (input + 1)], (hidden, (input + 1))))
        theta2 = np.matrix(np.reshape(params[hidden * (input + 1):], (label, (hidden + 1))))
        J_up = cost(theta1, theta2, X, y, lambd)
        params[i] -= 2 * change
        theta1 = np.matrix(np.reshape(params[:hidden * (input + 1)], (hidden, (input + 1))))
        theta2 = np.matrix(np.reshape(params[hidden * (input + 1):], (label, (hidden + 1))))
        J_down = cost(theta1, theta2, X, y, lambd)
        grad[i] = (J_up - J_down) / (2 * change)
        params[i] += change  # restore the parameter
    return grad
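Each component of the gradient is approximated with a centered finite difference (here $\varepsilon = 0.001$):

$$\frac{\partial J}{\partial \theta_i} \approx \frac{J(\theta + \varepsilon e_i) - J(\theta - \varepsilon e_i)}{2\varepsilon}$$

Be warned that looping over all 10285 parameters with the full 5000-sample dataset is slow; in practice gradient checking is often done on a smaller network or a subset of the data.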
grad_approx = get_approx_grad(params, 0.001, hidden, label, input, data['X'], y_onehot, 1)
grad_diff = np.sqrt(np.sum(np.power(grad_approx - grad, 2)))  # Euclidean distance between the two gradients
grad_diff
# 3.2421633421969986e-08
# fit model
from scipy.optimize import minimize
# minimize the objective function
fmin = minimize(fun=back, x0=(params), args=(hidden, label, input, data['X'], y_onehot, 1),
method='TNC', jac=True,options={'maxfun': 250})
fmin
# message: Max. number of function evaluations reached
# success: False
# status: 3
# fun: 0.3256656137323782
# x: [-2.727e+00 3.072e-03 ... -6.346e-01 -3.700e+00]
# nit: 22
# jac: [-9.848e-05 6.145e-07 ... 1.824e-05 1.513e-05]
# nfev: 250
# success is False only because we hit the maxfun cap of 250; the cost has already dropped to about 0.33
# reshape the optimized parameters back into the two weight matrices
X = np.matrix(data['X'])
thetafinal1 = np.matrix(np.reshape(fmin.x[:hidden * (input + 1)], (hidden, (input + 1))))
thetafinal2 = np.matrix(np.reshape(fmin.x[hidden * (input + 1):], (label, (hidden + 1))))
# model performance
A0, Z1, A1, Z2, h = forward(thetafinal1, thetafinal2, data['X'])
y_pred = np.array(np.argmax(h, axis=1) + 1)  # +1 because the labels run from 1 to 10
from sklearn.metrics import classification_report
print(classification_report(data['y'], y_pred))
# precision recall f1-score support
#
# 1 0.99 1.00 1.00 500
# 2 0.99 0.99 0.99 500
# 3 0.99 0.98 0.99 500
# 4 1.00 0.99 0.99 500
# 5 1.00 0.99 1.00 500
# 6 1.00 1.00 1.00 500
# 7 0.99 1.00 0.99 500
# 8 1.00 1.00 1.00 500
# 9 0.99 0.99 0.99 500
# 10 0.99 1.00 1.00 500
#
# accuracy 0.99 5000
# macro avg 0.99 0.99 0.99 5000
# weighted avg 0.99 0.99 0.99 5000
Dataset
Link: https://pan.baidu.com/s/1zteJBsMJ0GRwqRb5opOgwg (extraction code: 78ah)