At first I was completely lost; only in the middle of the night did my head finally clear up. I typed it out step by step following the formulas, and since I couldn't find a reference implementation online, I'm not even sure it's correct.
First, here is the multi-layer neural network I wrote earlier:
import scipy.io as scio
import numpy as np
import matplotlib.pyplot as plt
def init(x):
    # Layer sizes: 10 -> 7 -> 3 -> 1
    first_num = 10
    second_num = 7
    third_num = 3
    np.random.seed(5)
    # He initialization: scale by sqrt(2 / fan_in), suited to the ReLU layers below
    w1 = np.random.randn(first_num, x.shape[0]) * np.sqrt(2 / x.shape[0])
    b1 = np.zeros((first_num, 1))
    w2 = np.random.randn(second_num, first_num) * np.sqrt(2 / first_num)
    b2 = np.zeros((second_num, 1))
    w3 = np.random.randn(third_num, second_num) * np.sqrt(2 / second_num)
    b3 = np.zeros((third_num, 1))
    w4 = np.random.randn(1, third_num) * np.sqrt(2 / third_num)
    b4 = np.zeros((1, 1))
    ini_param = {
        "w1": w1,
        "b1": b1,
        "w2": w2,
        "b2": b2,
        "w3": w3,
        "b3": b3,
        "w4": w4,
        "b4": b4
    }
    return ini_param
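Just to be safe, here is a quick shape check of the initializer (the two input features below are a placeholder; the real dimensions come from the data loaded later):
p = init(np.zeros((2, 5)))  # 2 features, 5 dummy examples
for k, v in p.items():
    print(k, v.shape)
# Expect w1 (10, 2), b1 (10, 1), w2 (7, 10), ..., w4 (1, 3), b4 (1, 1)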
# Build the forward pass
def cal_z(w, a, b):  # a: (units in previous layer, examples); w: (units in this layer, units in previous layer)
    return np.dot(w, a) + b
def cal_sigma(z):  # z: (units in this layer, examples)
    return 1 / (1 + np.exp(-z))
def cal_tan_h(z):  # z1: (hidden_num, x.shape[1])
    return np.tanh(z)
def cal_relu(z):  # z: (units in this layer, examples)
    return np.maximum(0, z)
def forward_f(x, p):
    # Three ReLU hidden layers, sigmoid output
    z1 = cal_z(p["w1"], x, p["b1"])
    a1 = cal_relu(z1)
    z2 = cal_z(p["w2"], a1, p["b2"])
    a2 = cal_relu(z2)
    z3 = cal_z(p["w3"], a2, p["b3"])
    a3 = cal_relu(z3)
    z4 = cal_z(p["w4"], a3, p["b4"])
    a4 = cal_sigma(z4)
    forward_param = {
        "z1": z1,
        "a1": a1,
        "z2": z2,
        "a2": a2,
        "z3": z3,
        "a3": a3,
        "z4": z4,
        "a4": a4
    }
    return forward_param
# Cross-entropy cost
def cost_f(a, y):  # a: (1, examples); y: (1, examples)
    m = y.shape[1]
    return -np.sum(y * np.log(a) + (1 - y) * np.log(1 - a)) / m
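One caveat I'd note: if the sigmoid saturates, a4 can hit exactly 0 or 1 and np.log returns -inf. The assignment doesn't guard against this; a clipped variant (my own addition, not from the assignment) would look like:
def cost_f_safe(a, y):
    # Same cross-entropy, but clip a away from 0 and 1 to avoid log(0)
    m = y.shape[1]
    a = np.clip(a, 1e-12, 1 - 1e-12)
    return -np.sum(y * np.log(a) + (1 - y) * np.log(1 - a)) / m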
# Backward pass
def cal_dz_last(a, y):  # a: (1, examples); y: (1, examples)
    return a - y
def cal_dw_db(dz, a, m):  # dz_i: (units in layer i, examples); a_{i-1}: (units in layer i-1, examples)
    return np.dot(dz, a.T) / m, np.sum(dz, axis=1, keepdims=True) / m
def cal_da(dz, w):  # dz_i: (units in layer i, examples); w_i: (units in layer i, units in layer i-1)
    return np.dot(w.T, dz)
def cal_drelu(da, z):  # da_i: (units in layer i, examples); z_i: (units in layer i, examples)
    t = np.ones(z.shape)
    t[z <= 0] = 0
    return da * t
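Side note: the mask in cal_drelu can be built in one step, since a boolean array multiplies like 0/1; an equivalent compact version:
def cal_drelu_compact(da, z):
    # ReLU derivative is 1 where z > 0 and 0 elsewhere (0 at the kink z == 0)
    return da * (z > 0)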
def back_f(p, f_p, x, y):
    dz4 = cal_dz_last(f_p["a4"], y)
    dw4, db4 = cal_dw_db(dz4, f_p["a3"], y.shape[1])
    da3 = cal_da(dz4, p["w4"])
    dz3 = cal_drelu(da3, f_p["z3"])
    dw3, db3 = cal_dw_db(dz3, f_p["a2"], y.shape[1])
    da2 = cal_da(dz3, p["w3"])
    dz2 = cal_drelu(da2, f_p["z2"])
    dw2, db2 = cal_dw_db(dz2, f_p["a1"], y.shape[1])
    da1 = cal_da(dz2, p["w2"])
    dz1 = cal_drelu(da1, f_p["z1"])
    dw1, db1 = cal_dw_db(dz1, x, y.shape[1])
    back_param = {
        "dw1": dw1,
        "db1": db1,
        "dw2": dw2,
        "db2": db2,
        "dw3": dw3,
        "db3": db3,
        "dw4": dw4,
        "db4": db4
    }
    return back_param
# Update parameters
def update_p(p, b_p, learning_rate):
    upd_p = {
        "w1": p["w1"] - learning_rate * b_p["dw1"],
        "b1": p["b1"] - learning_rate * b_p["db1"],
        "w2": p["w2"] - learning_rate * b_p["dw2"],
        "b2": p["b2"] - learning_rate * b_p["db2"],
        "w3": p["w3"] - learning_rate * b_p["dw3"],
        "b3": p["b3"] - learning_rate * b_p["db3"],
        "w4": p["w4"] - learning_rate * b_p["dw4"],
        "b4": p["b4"] - learning_rate * b_p["db4"]
    }
    return upd_p
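Those eight hand-written lines can also be generated in a loop; a sketch that relies on b_p's keys being p's keys with a "d" prefix:
def update_p_loop(p, b_p, learning_rate):
    # "w1" pairs with "dw1", "b1" with "db1", and so on
    return {k: p[k] - learning_rate * b_p["d" + k] for k in p}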
Below is the gradient check, written straight from the formula. I spent a whole afternoon trying to vectorize the computation and failed, so I fell back on the dumbest possible loop:
# Gradient check
def cal_dcost(x, y, p, theta, key):
    # Numerically approximate dJ/dp[key] one component at a time:
    # (J(p + theta) - J(p - theta)) / (2 * theta)
    dt = []
    for i in range(p[key].shape[0]):
        for j in range(p[key].shape[1]):
            p[key][i, j] = p[key][i, j] + theta
            p_plust = forward_f(x, p)
            p[key][i, j] = p[key][i, j] - 2 * theta
            p_subt = forward_f(x, p)
            p[key][i, j] = p[key][i, j] + theta  # restore the original value
            dapprox = (cost_f(p_plust["a4"], y) - cost_f(p_subt["a4"], y)) / (2 * theta)
            dt.append(dapprox)
    return dt
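For what it's worth, the usual way to "vectorize" gradient checking is not to remove this loop (each component really does need two forward passes) but to flatten all parameters into one long vector first, so the final comparison is a single norm. A sketch of those helpers (the names are mine, not from the assignment):
def params_to_vector(p):
    # Flatten every parameter matrix (row-major) into one column vector
    return np.concatenate([p[k].reshape(-1, 1) for k in p], axis=0)
def vector_to_params(vec, p_template):
    # Inverse of params_to_vector; p_template supplies the keys and shapes
    out, pos = {}, 0
    for k in p_template:
        n = p_template[k].size
        out[k] = vec[pos:pos + n].reshape(p_template[k].shape)
        pos += n
    return out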
def grad_check(x, y, p, b_p, theta=0.0000001):
    # Stack the numerical gradients of every parameter into one column vector
    theta_arr = np.empty((0, 1))
    for i in p.keys():
        t = cal_dcost(x, y, p, theta, i)
        t = np.array(t).reshape(-1, 1)
        theta_arr = np.append(theta_arr, t, axis=0)
    # Stack the backprop gradients in the same key order
    dtheta_arr = np.empty((0, 1))
    for i in b_p.keys():
        t = np.copy(b_p[i]).reshape(-1, 1)
        dtheta_arr = np.append(dtheta_arr, t, axis=0)
    # With axis=1 this yields one ratio per component, not a single scalar
    return np.linalg.norm(theta_arr - dtheta_arr, axis=1) / (
            np.linalg.norm(theta_arr, axis=1) + np.linalg.norm(dtheta_arr, axis=1))
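Note that with axis=1 my grad_check returns one ratio per parameter component. The lecture formula produces a single scalar over the whole gradient vectors, which also avoids dividing by zero when an individual component is 0 on both sides; a sketch of that version:
def grad_check_scalar(x, y, p, b_p, theta=1e-7):
    # Single-number check: ||approx - grad|| / (||approx|| + ||grad||)
    approx = np.concatenate(
        [np.array(cal_dcost(x, y, p, theta, k)).reshape(-1, 1) for k in p], axis=0)
    grad = np.concatenate([b_p["d" + k].reshape(-1, 1) for k in p], axis=0)
    return np.linalg.norm(approx - grad) / (np.linalg.norm(approx) + np.linalg.norm(grad))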
Build the model and plot:
# Build the model
def model(x, y, learning_rate, loop_num):
    p = init(x)
    cost_t = []
    for i in range(loop_num):
        f_p = forward_f(x, p)
        b_p = back_f(p, f_p, x, y)
        # Check gradients before the update, so the numerical gradient is
        # taken at the same parameters that produced b_p
        print(grad_check(x, y, p, b_p))
        p = update_p(p, b_p, learning_rate)
        cost_t.append(cost_f(f_p["a4"], y))
    return p, cost_t
# Model construction ends here
def print_figure(x, y, final_p, _cost):
    x_min, x_max = x[0, :].min() - 1, x[0, :].max() + 1
    y_min, y_max = x[1, :].min() - 1, x[1, :].max() + 1
    h = 0.01
    # Generate a grid of points with distance h between them
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    # Predict the function value for the whole grid
    z = forward_f(np.vstack((xx.ravel(), yy.ravel())), final_p)
    z = np.round(z["a4"]).reshape(xx.shape)
    # Plot the contour and training examples
    plt.contourf(xx, yy, z, cmap=plt.cm.Spectral)
    plt.ylabel('x2')
    plt.xlabel('x1')
    plt.scatter(x[0, :], x[1, :], c=y.ravel(), cmap=plt.cm.Spectral)  # ravel so c is 1-D
    plt.figure()
    plt.plot(_cost)
    plt.show()
The assignment doesn't come with code for reading the .mat data, so you have to load it yourself:
data_path = r".\datasets\data.mat"
data = scio.loadmat(data_path)
print(data.keys())
x_train = data.get('X')
y_train = data.get('y')
x_test = data.get("Xval")
y_test = data.get("yval")
print(x_train.shape)
print(y_train.shape)
model_p, cost = model(x_train.T, y_train.T, 0.001, 1)
forward_f(x_train.T, model_p)
print_figure(x_train.T, y_train.T, model_p, cost)
The run raised a warning: RuntimeWarning: invalid value encountered in true_divide, pointing at
np.linalg.norm(theta_arr, axis=1) + np.linalg.norm(dtheta_arr, axis=1))
It means the divisor contains zeros. I checked those entries, and for some components the numerical and backprop derivatives really are both 0. I suspect those nodes simply aren't contributing anything during gradient descent.
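If the culprit is dead ReLU units, i.e. rows of z that never go positive on this data so their gradients are identically zero, that's easy to confirm with a small check (my own snippet, not part of the assignment):
f_p = forward_f(x_train.T, model_p)
for name in ("z1", "z2", "z3"):
    # A unit is dead on this dataset if its pre-activation is never positive
    dead = int(np.sum(np.all(f_p[name] <= 0, axis=1)))
    print(name, "dead units:", dead)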
Andrew Ng said in the lectures that 10^-7 is perfect and 10^-4 to 10^-5 is still no big deal, so I'll count this as scraping by. I don't want to fry my brain any further; computers really do make your hair fall out.