1、参数数组的展平（多维→一维向量）与还原（一维向量→多维）
# Flatten several parameter arrays into one vector.
def serializer(thetas):
    """Concatenate the flattened parameter arrays into a single 1-D vector.

    Args:
        thetas: iterable of array-likes (e.g. the weight matrix of each layer).

    Returns:
        A new 1-D array holding every element of every input, in order, each
        input being flattened in row-major order. An empty iterable yields an
        empty 1-D array.
    """
    flat_parts = [np.ravel(t) for t in thetas]
    if not flat_parts:
        # np.concatenate rejects an empty sequence; keep returning an empty
        # vector for that case.
        return np.array([])
    # A single concatenate avoids re-copying the growing result on every
    # iteration and keeps the inputs' dtype (no integer seed element).
    return np.concatenate(flat_parts, axis=0)
# Restore the individual parameter matrices from the flat vector.
def deserialize(theta, shapes=((25, 401), (10, 26))):
    """Split a flat parameter vector back into its weight matrices.

    Args:
        theta: array-like holding all parameters back to back.
        shapes: shape of each matrix, in the order they were serialized.
            Defaults to the (25, 401) and (10, 26) layout used by this
            network.

    Returns:
        A tuple with one reshaped array per entry in ``shapes``.

    Raises:
        ValueError: if the number of elements in ``theta`` does not match
            the total size implied by ``shapes``.
    """
    flat = np.ravel(theta)
    sizes = [int(np.prod(shape)) for shape in shapes]
    expected = sum(sizes)
    if flat.size != expected:
        raise ValueError(
            "theta has %d elements, expected %d for shapes %r"
            % (flat.size, expected, tuple(shapes))
        )
    parts = []
    start = 0
    for size, shape in zip(sizes, shapes):
        parts.append(flat[start:start + size].reshape(shape))
        start += size
    return tuple(parts)
2、将y转换为one-hot向量
def convert(y):
    """One-hot encode a column of integer labels.

    Each row's label is taken from its first column and mapped to column
    ``label % 10`` of the output, so a label of 10 lands in column 0.

    Args:
        y: 2-D array-like of integer labels, one label per row.

    Returns:
        Float array of shape ``(len(y), number_of_distinct_labels)`` with a
        single 1 per row. Empty input yields an array with zero rows.
    """
    labels = np.asarray(y)
    n_rows = labels.shape[0]
    n_classes = len(np.unique(labels))
    one_hot = np.zeros((n_rows, n_classes))
    if n_rows == 0:
        return one_hot
    # Set the hot entry of every row in one indexed assignment instead of
    # growing the result row by row.
    one_hot[np.arange(n_rows), labels[:, 0] % 10] = 1
    return one_hot
3、前向传播并计算cost
def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)), applied element-wise."""
    denominator = 1 + np.exp(-z)
    return 1 / denominator
def feedforword(thetas, X):
    """Run a forward pass through every layer stored in ``thetas``.

    Args:
        thetas: flat parameter vector, split into weight matrices by
            ``deserialize``.
        X: 2-D input array, one sample per row.

    Returns:
        ``(A, Z)`` where ``A`` lists the bias-augmented input of each layer
        followed by the final layer's output, and ``Z`` lists each layer's
        pre-activation values.
    """
    activations, pre_activations = [], []
    layer_output = X
    for weights in deserialize(thetas):
        # Prepend the bias column of ones before applying the layer weights.
        with_bias = np.insert(layer_output, 0, 1, axis=1)
        activations.append(with_bias)
        weighted_sum = with_bias.dot(weights.T)
        pre_activations.append(weighted_sum)
        layer_output = sigmoid(weighted_sum)
    activations.append(layer_output)
    return activations, pre_activations
# Cost without regularization.
def not_regularized_cost(thetas, X, y):
    """Return the mean cross-entropy cost of the network on ``(X, y)``.

    Args:
        thetas: flat parameter vector, split into weight matrices by
            ``deserialize``.
        X: 2-D input array, one sample per row.
        y: target array compared element-wise with the network output.

    Returns:
        The cross-entropy summed over each row and averaged over the rows.
    """
    hypothesis = X
    for weights in deserialize(thetas):
        with_bias = np.insert(hypothesis, 0, 1, axis=1)
        hypothesis = sigmoid(with_bias.dot(weights.T))
    per_sample = np.sum(
        (-y) * np.log(hypothesis) - (1 - y) * np.log(1 - hypothesis), axis=1
    )
    return np.mean(per_sample)
# Cost with regularization.
def regularized_cost(thetas, X, y, lamda):
    """Return the cross-entropy cost plus an L2 penalty on the weights.

    Args:
        thetas: flat parameter vector, split into weight matrices by
            ``deserialize``.
        X: 2-D input array, one sample per row.
        y: target array compared element-wise with the network output.
        lamda: regularization strength.

    Returns:
        Mean cross-entropy plus ``lamda / (2 * m)`` times the sum of squared
        weights of every layer, the first (bias) column excluded, where ``m``
        is the number of rows of ``X``.
    """
    m = X.shape[0]
    scale = lamda / (2 * m)
    penalty = 0
    hypothesis = X
    for weights in deserialize(thetas):
        with_bias = np.insert(hypothesis, 0, 1, axis=1)
        hypothesis = sigmoid(with_bias.dot(weights.T))
        # The bias column is not regularized.
        non_bias = weights[:, 1:]
        penalty += scale * np.sum(non_bias * non_bias)
    per_sample = np.sum(
        (-y) * np.log(hypothesis) - (1 - y) * np.log(1 - hypothesis), axis=1
    )
    data_term = np.mean(per_sample)
    return data_term + penalty
加载数据与已训练的参数，验证代价函数的输出结果
# Load the weights and the data set, then check that the hand-written forward
# pass and the two cost functions agree.
# NOTE(review): 文件路径 ("file path") is a placeholder name used for both
# loads; presumably the first is the weights file and the second the data
# file - confirm and substitute the real paths.
theta = sio.loadmat(文件路径)
data = sio.loadmat(文件路径)
y = convert(data['y'])
# The training set orders the labels as 1 2 3 ... 0,
# while convert() produces columns ordered 0 1 2 ... 9,
# so column 0 is moved to the end.
y0 = y[:, 0].reshape(y.shape[0], 1)
y = np.concatenate((y[:, 1:], y0), axis=1) # (5000,10)
X = data["X"] # (5000,400)
theta1 = theta["Theta1"] # shape(25,401)
theta2 = theta["Theta2"] # shape(10,26)
theta = serializer((theta1, theta2)) # 1-D array
# Manual forward pass, layer by layer, with a bias column of ones inserted
# before each layer.
a1 = X
a1 = np.insert(a1, 0, 1, axis=1) # (5000,401)
a2 = sigmoid(a1.dot(theta1.T)) # (5000,25)
a2 = np.insert(a2, 0, 1, axis=1) # (5000,26)
a3 = sigmoid(a2.dot(theta2.T)) # (5000,10)
# The result `a` is not used again in the visible source.
a = feedforword(theta, X)
# Cross-entropy cost computed directly from the manual forward pass.
cost = np.mean(np.sum((-y)*np.log(a3)-(1-y)*np.log(1-a3), axis=1))
print(cost) # 0.2876291651613189
print(not_regularized_cost(theta, X, y)) # 0.2876291651613189
print(regularized_cost(theta, X, y, 1)) # 0.38376985909092365
4、反向传播
# Gradient of the sigmoid.
def sigmoid_gradient(z):
    """Return sigmoid(z) * (1 - sigmoid(z)), applied element-wise."""
    activation = sigmoid(z)
    return activation * (1 - activation)
# Random parameter initialization within [-e, e).
def random_initialize_weights(shape, e=0.12):
    """Return a random weight matrix with entries drawn from [-e, e).

    Args:
        shape: sequence whose first two items are the row and column counts.
        e: half-width of the sampling interval.

    Returns:
        Array of shape ``(shape[0], shape[1])`` built from ``np.random.rand``
        samples shifted and scaled into the interval.
    """
    rows, cols = shape[0], shape[1]
    centered = np.random.rand(rows, cols) - 0.5
    return centered * 2 * e
# Backpropagation.
def back(thetas, X, y, lamda):
    """Return the gradient of the regularized cost as a flat vector.

    Args:
        thetas: flat parameter vector, split into the two weight matrices by
            ``deserialize``.
        X: 2-D input array, one sample per row.
        y: target array with the same shape as the network output.
        lamda: regularization strength.

    Returns:
        The gradients of both weight matrices, flattened with ``serializer``
        in the same layout as ``thetas``.
    """
    A, Z = feedforword(thetas, X)
    a1, a2, a3 = A
    z2, _ = Z
    # Use the parameters passed in: the optimizer calls this with a new
    # vector on every evaluation, so a module-level copy would be stale.
    theta1, theta2 = deserialize(thetas)
    m = X.shape[0]
    d3 = a3 - y
    # Drop the bias column before propagating the error to the hidden layer.
    d2 = d3.dot(theta2)[:, 1:] * sigmoid_gradient(z2)
    # The bias column is not regularized. Work on copies so the caller's
    # parameter vector is never modified.
    reg1 = np.array(theta1, copy=True)
    reg1[:, 0] = 0
    reg2 = np.array(theta2, copy=True)
    reg2[:, 0] = 0
    # The penalty gradient scales with lamda, matching regularized_cost.
    D1 = (1 / m) * d2.T.dot(a1) + (lamda / m) * reg1
    D2 = (1 / m) * d3.T.dot(a2) + (lamda / m) * reg2
    return serializer((D1, D2))
代入数据
# Sanity-check the helpers, then train the network from random weights.
print(sigmoid_gradient(0)) # 0.25
print(random_initialize_weights((2, 2)))
# y is rebuilt with convert() alone, so column k holds the rows whose
# label % 10 equals k.
y = convert(data['y'])
theta1 = random_initialize_weights((25, 401))
theta2 = random_initialize_weights((10, 26))
theta = serializer((theta1, theta2))
# Minimize the regularized cost (lamda = 1) using `back` as the gradient.
# NOTE(review): `opt` is presumably scipy.optimize - confirm against the imports.
res = opt.minimize(fun=regularized_cost, x0=theta, args=(X, y, 1), method="TNC", jac=back)
print(res)
print(res.x.shape) # 10285
# Split the optimized vector back into matrices and save them.
theta1, theta2 = deserialize(res.x)
sio.savemat("parametersWeights.mat", {"theta1": theta1, "theta2": theta2})
得出结果
0.25
[[ 0.02190042 -0.01471704]
[ 0.09290001 0.02323642]]
fun: 0.635188914309726
jac: array([-5.57739695e-05, -1.65921829e-05, 1.77331961e-05, ...,
2.06364816e-04, -7.09894744e-04, -6.71933834e-04])
message: 'Converged (|f_n-f_(n-1)| ~= 0)'
nfev: 221
nit: 15
status: 1
success: True
x: array([ 0.14616376, 0.01840686, -0.02081726, ..., -5.62499118,
2.55922323, -3.19727727])
(10285,)
5、预测
# Predict with the trained parameters.
def predict(thetas, X):
    """Return one-hot predictions for every row of ``X``.

    Args:
        thetas: flat parameter vector, passed to ``feedforword``.
        X: 2-D input array, one sample per row.

    Returns:
        Float array with the same shape as the network output, holding a
        single 1 per row at the position of that row's largest output.
    """
    a3 = feedforword(thetas, X)[0][-1]
    p = np.zeros(a3.shape)
    # Mark the arg-max of every row in one indexed assignment instead of
    # concatenating one row at a time.
    p[np.arange(a3.shape[0]), np.argmax(a3, axis=1)] = 1
    return p
# Print per-class metrics comparing the one-hot targets with the predictions.
# NOTE(review): classification_report is presumably sklearn.metrics' - confirm against the imports.
print(classification_report(y, predict(res.x, X)))
输出结果
precision recall f1-score support
0 0.96 0.98 0.97 500
1 0.96 0.96 0.96 500
2 0.93 0.92 0.93 500
3 0.93 0.90 0.92 500
4 0.95 0.96 0.96 500
5 0.93 0.92 0.93 500
6 0.96 0.97 0.97 500
7 0.94 0.92 0.93 500
8 0.92 0.94 0.93 500
9 0.94 0.93 0.93 500
micro avg 0.94 0.94 0.94 5000
macro avg 0.94 0.94 0.94 5000
weighted avg 0.94 0.94 0.94 5000
samples avg 0.94 0.94 0.94 5000
6、可视化
# Tile several images into one image.
def mapping(images, images_dimension):
    """Arrange flattened square images into a single grid image.

    Args:
        images: array-like whose last axis holds one flattened square image
            per entry (for example 400 values for a 20x20 image).
        images_dimension: number of images per grid row.

    Returns:
        2-D array of shape ``(rows * side, images_dimension * side)`` with
        the images laid out left to right, top to bottom. An incomplete last
        row is padded with zeros.

    Raises:
        ValueError: if ``images_dimension`` is not positive or the image
            length is not a perfect square.
    """
    images = np.asarray(images)
    if images_dimension < 1:
        raise ValueError("images_dimension must be a positive integer")
    side = int(np.sqrt(images.shape[-1]))
    if side * side != images.shape[-1]:
        raise ValueError("each image must contain a square number of values")
    tiles = images.reshape(-1, side, side)
    count = tiles.shape[0]
    if count == 0:
        return np.zeros((0, side * images_dimension), dtype=images.dtype)
    n_rows = -(-count // images_dimension)  # ceiling division
    missing = n_rows * images_dimension - count
    if missing:
        filler = np.zeros((missing, side, side), dtype=tiles.dtype)
        tiles = np.concatenate((tiles, filler), axis=0)
    grid = tiles.reshape(n_rows, images_dimension, side, side)
    # Reorder to (grid row, pixel row, grid column, pixel column) so the
    # final reshape stitches the tiles side by side.
    return grid.transpose(0, 2, 1, 3).reshape(
        n_rows * side, images_dimension * side
    )
# Visualize the input and output of the hidden layer.
def visualizing_the_hidden_layer(theta, X):
    """Plot the hidden layer's input and output for the first 25 samples.

    Args:
        theta: flat parameter vector, passed to ``feedforword``.
        X: 2-D input array, one sample per row.

    Both activations have their bias column removed, are tiled five per row
    with ``mapping`` and shown side by side; the input grid is transposed
    before display.
    """
    activations, _ = feedforword(theta, X)
    a1, a2, _a3 = activations
    hidden_in = mapping(a1[..., 1:][:25], 5)
    hidden_out = mapping(a2[..., 1:][:25], 5)
    panels = (
        (hidden_in.T, "hidden layer input"),
        (hidden_out, "hidden layer output"),
    )
    for position, (picture, caption) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.axis("off")
        plt.imshow(picture)
        plt.title(caption)
    plt.show()
# Show the hidden-layer visualization for the trained parameters.
visualizing_the_hidden_layer(res.x, X)