All of the code here is written in a fully vectorized style:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.io import loadmat
from sklearn.metrics import classification_report
from scipy.optimize import minimize
from sklearn.preprocessing import OneHotEncoder
def sigmoid(z):
return 1 / (1 + np.exp(-z))
def sigmoid_gradient(z):
return np.multiply(sigmoid(z), (1 - sigmoid(z)))
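# sanity check: sigmoid_gradient(0) == 0.25, since sigmoid(0) = 0.5 and 0.5 * (1 - 0.5) = 0.25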
def forward_propagate(X, theta1, theta2):
X = np.mat(X)
theta1 = np.mat(theta1)
theta2 = np.mat(theta2)
a1 = X
z2 = theta1 * X.T
a2 = np.insert(sigmoid(z2), 0, 1, axis=0)
z3 = theta2 * a2
h = sigmoid(z3)
    # each column of h is the prediction vector for one example
    # h = np.argmax(h, axis=0).T + 1
return z2, a2, z3, h
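# Shape walk-through (m = 5000 examples in this exercise):
# X: 5000*401, theta1: 25*401 -> z2 = theta1 * X.T: 25*5000
# a2 (bias row added): 26*5000, theta2: 10*26 -> z3: 10*5000 -> h: 10*5000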
# Vectorized cost function (learnRate is the regularization strength lambda, kept under its original name)
def cost(theta1, theta2, hidden_size, num_labels, X, y, learnRate):
X = np.mat(X)
y = np.mat(y)
theta1 = np.mat(theta1)
theta2 = np.mat(theta2)
z2, a2, z3, h = forward_propagate(X, theta1, theta2)
    # The two schemes below are only needed if data['y'] = [10,10,...,9,9,...,1] is passed in
    # raw, without one-hot encoding having been applied first
    # Scheme 1: start from a 10*5000 zero matrix and use a loop to set the right entry
    # of each column to 1, i.e. if y[i] = 2, the i-th column becomes [0,1,0,0,...,0]
# temp = np.mat(np.zeros((theta2.shape[0], X.shape[0])))
# for i in range(X.shape[0]):
# temp[y[i, 0] - 1, i] = 1
    # Scheme 2: encode with sklearn.preprocessing's OneHotEncoder
    # encoder = OneHotEncoder(sparse_output=False)  # on sklearn < 1.2 use sparse=False
    # y must be converted to an np.array here, otherwise sklearn emits a warning
    # y = np.array(y)
    # temp = encoder.fit_transform(y).T
    # print(temp.shape)
    # Computing the cost purely with matrix ops is cheaper than a for loop.
    # temp (or y here, which already arrives one-hot encoded) is 10*5000, and h,
    # the prediction matrix for X, is 10*5000 as well: multiply them elementwise
    # and sum every entry to get the first and second terms below
first = np.sum(np.multiply(-y, np.log(h)))
    # guard against log(0): add 1e-5 so the argument approximates but never reaches 0
second = np.sum(np.multiply(1 - y, np.log(1 - h + 1e-5)))
    # when regularizing, remember to skip every theta0 (bias) term; including them
    # changes little in practice, but the convention is to leave them out
reg = (np.sum(np.power(theta1[:, 1:], 2)) + np.sum(np.power(theta2[:, 1:], 2))) * learnRate / (2 * X.shape[0])
return (first - second) / X.shape[0] + reg
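# For reference: with the pre-trained weights from ex4weights.mat and learnRate = 1,
# this cost evaluates to roughly 0.384, the regularized value quoted in the ex4
# handout (the 1e-5 guard shifts it only marginally)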
# Backpropagation
def backprop(params, input_size, hidden_size, num_labels, X, y, learnRate):
X = np.mat(X)
y = np.mat(y)
    # params = np.mat(params).reshape(1, hidden_size * (input_size + 1) + num_labels * (hidden_size + 1))
    # unpack the flattened params vector back into theta1 and theta2
    # theta1 = np.mat(params[:, :hidden_size * (input_size + 1)].reshape(hidden_size, input_size + 1))
    # theta2 = np.mat(params[:, hidden_size * (input_size + 1):].reshape(num_labels, hidden_size + 1))
theta1 = np.mat(params[:hidden_size * (input_size + 1)].reshape(hidden_size, input_size + 1))
theta2 = np.mat(params[hidden_size * (input_size + 1):].reshape(num_labels, hidden_size + 1))
# print(theta1.shape, theta2.shape)
z2, a2, z3, h = forward_propagate(X, theta1, theta2)
z2 = np.mat(np.insert(z2, 0, 1, axis=0))
a2 = np.mat(a2)
z3 = np.mat(z3)
h = np.mat(h)
    # compute the cost, exactly as in cost() above
first = np.sum(np.multiply(-y, np.log(h)))
second = np.sum(np.multiply(1 - y, np.log(1 - h + 1e-5)))
reg = (np.sum(np.power(theta1[:, 1:], 2)) + np.sum(np.power(theta2[:, 1:], 2))) * learnRate / (2 * X.shape[0])
J = (first - second) / X.shape[0] + reg
    # backward pass
dt3 = h - y # 10*5000
dt2 = np.multiply(theta2.T * dt3, sigmoid_gradient(z2)) # 26*5000
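    # dt2 keeps the bias row for now (z2 had a row of ones inserted above);
    # it is dropped again below via dt2[1:, :]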
# print(dt3.shape, a2.shape)
    # first compute the raw gradients, then add the regularization term to the non-bias columns
delta1 = np.dot(dt2[1:, :], X) / X.shape[0]
delta1[:, 1:] += (theta1[:, 1:] * learnRate / X.shape[0])
delta2 = np.dot(dt3, a2.T) / X.shape[0]
delta2[:, 1:] += (theta2[:, 1:] * learnRate / X.shape[0])
# print(delta1.shape, delta2.shape)
grad = np.concatenate((np.ravel(delta1), np.ravel(delta2)))
# print(grad.shape)
# print(params.shape)
return J, grad
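# Optional sanity check, a sketch that is not part of the original exercise flow:
# compare backprop's analytic gradient with a two-sided numerical estimate on a
# few randomly chosen parameters; eps = 1e-4 is the step suggested in the handout.
def gradient_check(params, input_size, hidden_size, num_labels, X, y, learnRate, n_checks=5, eps=1e-4):
    _, grad = backprop(params, input_size, hidden_size, num_labels, X, y, learnRate)
    for i in np.random.choice(params.shape[0], n_checks, replace=False):
        plus, minus = params.copy(), params.copy()
        plus[i] += eps
        minus[i] -= eps
        J_plus, _ = backprop(plus, input_size, hidden_size, num_labels, X, y, learnRate)
        J_minus, _ = backprop(minus, input_size, hidden_size, num_labels, X, y, learnRate)
        # the analytic and numerical gradients should agree to several decimal places
        print(i, (J_plus - J_minus) / (2 * eps), grad[i])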
if __name__ == '__main__':
data = loadmat('ex4data1.mat')
# print(data)
    # after the bias column is inserted, X is 5000*401; data['y'] is 5000*1
y_true = data['y']
    encoder = OneHotEncoder(sparse_output=False)  # on sklearn < 1.2 use sparse=False
X = np.mat(np.insert(data['X'], 0, 1, axis=1))
y = np.mat(encoder.fit_transform(data['y']).T)
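    # e.g. the label 3 becomes the column [0,0,1,0,0,0,0,0,0,0]^T, so y is 10*5000
    # and lines up column-for-column with h returned by forward_propagate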
# print(y)
# print(X.shape, y.shape)
weight = loadmat('ex4weights.mat')
    # theta1 is a 25*401 matrix, theta2 is a 10*26 matrix
theta1 = np.mat(weight['Theta1'])
theta2 = np.mat(weight['Theta2'])
# print(theta1.shape, theta2.shape)
# print(theta1, theta2)
    # display 100 randomly chosen sample images
# sample_index = np.random.choice(np.arange(X.shape[0]), 100, replace=False)
# sample = data['X'][sample_index, :]
# print(sample.shape)
    # draw the 100 samples in a 10*10 grid
# fig, ax = plt.subplots(nrows=10, ncols=10, sharex=True, sharey=True, figsize=(12, 12))
# for i in range(10):
# for j in range(10):
# ax[i, j].matshow(sample[10 * i + j].reshape(20, 20).T, cmap=plt.cm.binary)
# plt.xticks(np.array([]))
# plt.yticks(np.array([]))
# plt.show()
# z1 = theta1 * X.T
# a1 = np.insert(sigmoid(z1), 0, 1, axis=0)
# z2 = theta2 * a1
# a2 = sigmoid(z2)
# print(a2.shape)
# predict = np.argmax(a2, axis=0).T + 1
# print(predict.shape)
    # print(classification_report(y_true, predict))
hidden_size = 25
num_labels = 10
input_size = 400
    learnRate = 1  # regularization strength (lambda in the course notes)
# print(cost(theta1, theta2, hidden_size, num_labels, X, y, learnRate))
# print(sigmoid_gradient(0))
# a = np.mat([[1,2,3],
# [4,5,6],
# [1,2,1]])
# b = np.mat([[1],
# [2],
# [3]])
# print(np.multiply(a, b))
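    # initialize params uniformly in (-0.12, 0.12): 0.24 * (uniform - 0.5);
    # epsilon = 0.12 is the symmetry-breaking range suggested in the ex4 handout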
params = (np.random.random(size=hidden_size * (input_size + 1) + num_labels * (hidden_size + 1)) - 0.5) * 0.24
# print(params.shape[0])
    fmin = minimize(fun=backprop, x0=params, args=(input_size, hidden_size, num_labels, X, y, learnRate),
                    method='TNC', jac=True, options={'maxiter': 250})
theta1_f = np.mat(fmin.x[: hidden_size * (input_size + 1)]).reshape(hidden_size, input_size + 1)
theta2_f = np.mat(fmin.x[hidden_size * (input_size + 1):]).reshape(num_labels, hidden_size + 1)
# print(theta1_f.shape, theta2_f.shape)
z2, a2, z3, h = forward_propagate(X, theta1_f, theta2_f)
# print(h.shape)
y_pred = np.argmax(h, axis=0).T + 1
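    # +1 because row index 0 of h corresponds to class 1; in this dataset the digit 0 is stored as class 10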
print(classification_report(y_true, y_pred))
One thing to note: the x0 argument of minimize must be a single vector, so the network's two weight matrices theta1 and theta2 have to be unrolled into vectors and concatenated before they can be passed in, as the sketch below illustrates.
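A minimal sketch of that unroll/reshape round trip (shapes taken from this exercise; the variable names are illustrative):

import numpy as np

theta1 = np.random.random((25, 401))  # hidden-layer weights
theta2 = np.random.random((10, 26))   # output-layer weights
# flatten both matrices and concatenate into the single vector minimize expects as x0
params = np.concatenate((np.ravel(theta1), np.ravel(theta2)))
print(params.shape)  # (10285,) = 25*401 + 10*26
# inside backprop (and again after optimization) the vector is cut and reshaped back
t1 = params[:25 * 401].reshape(25, 401)
t2 = params[25 * 401:].reshape(10, 26)
assert (t1 == theta1).all() and (t2 == theta2).all()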