Network Diagram and Activation Functions
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_circles
Preparing the Data
def load_data():
    # 300 training samples and 100 test samples
    train_X, train_Y = make_circles(n_samples=300, noise=.05)
    test_X, test_Y = make_circles(n_samples=100, noise=.05)
    # Visualize the data
    plt.scatter(train_X[:, 0], train_X[:, 1], c=train_Y, s=40, cmap=plt.cm.Spectral)
    # Transpose so that each column is one sample: X becomes (2, m), Y becomes (1, m)
    train_X = train_X.T
    train_Y = train_Y.reshape((1, train_Y.shape[0]))
    test_X = test_X.T
    test_Y = test_Y.reshape((1, test_Y.shape[0]))
    return train_X, train_Y, test_X, test_Y
train_X,train_Y,test_X,test_Y = load_data()
print(train_X,train_Y,test_X,test_Y)
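As a quick sanity check of the layout (a minimal sketch; the expected shapes follow from the transposes inside load_data):
# Each column is one sample: features are (2, m), labels are (1, m)
print(train_X.shape, train_Y.shape)  # expected: (2, 300) (1, 300)
print(test_X.shape, test_Y.shape)    # expected: (2, 100) (1, 100)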
Several Activation Functions
sigmoid(z) = \frac{1}{1+e^{-z}}
tanh(z) = \frac{e^z-e^{-z}}{e^z+e^{-z}}
relu(z) = max(z,0)
def sigmoid(z):
    # Squashes z into (0, 1)
    return 1. / (1 + np.exp(-z))
def tanh(z):
    # Squashes z into (-1, 1)
    return (np.exp(z) - np.exp(-z)) / (np.exp(z) + np.exp(-z))
def relu(z):
    # (|z| + z) / 2 equals z when z > 0 and 0 otherwise, i.e. max(z, 0)
    return (np.abs(z) + z) / 2
x = np.linspace(-5,5,100)
y = sigmoid(x)
plt.plot(x,y)
plt.show()
y = tanh(x)
plt.plot(x,y)
plt.show()
y = relu(x)
plt.plot(x,y)
plt.show()
The derivative of the sigmoid function:
sigmoid^{\prime}(z) = sigmoid(z)(1-sigmoid(z))
The derivative of the ReLU function (undefined at z = 0; by convention we take it to be 0 there):
relu^{\prime}(z)=\left\{\begin{array}{ll} {1,} & {z>0} \\ {0,} & {z \leq 0} \end{array}\right.
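Backpropagation will need these derivatives, so here is a minimal sketch of both (the function names are my own, not from the original code; z is assumed to be a NumPy array):
def sigmoid_derivative(z):
    # s'(z) = s(z) * (1 - s(z)), reusing the sigmoid defined above
    s = sigmoid(z)
    return s * (1 - s)
def relu_derivative(z):
    # 1 where z > 0, 0 elsewhere (0 at z = 0 by convention)
    return (z > 0).astype(float)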
The Forward Propagation Process
First, every sample comes with an input
\left[\begin{array}{l} x_{1} \\ x_{2} \end{array}\right]_{(2,1)}
We call the first hidden layer H_1; its hidden_size is 20. The value that has not yet passed through the ReLU activation is called Z_1, so we immediately have:
Z_1 = W_1X+b_1
H_1 = RELU(Z_1)
Here W_1 is the parameter matrix from the input layer to the first hidden layer, and b_1 is its bias term.
We must pay close attention to the dimensions here: W_1 has shape (20,2) and b_1 has shape (20,1), so naturally Z_1 is (20,1), and likewise H_1 is also (20,1). The process looks roughly like
Z^{[1]}=W X+b^{[1]}=\left[\begin{array}{cc} w_{1,1}^{[1]} & w_{1,2}^{[1]} \\ w_{2,1}^{[1]} & w_{2,2}^{[1]} \\ \vdots & \vdots \\ w_{20,1}^{[1]} & w_{20,2}^{[1]} \end{array}\right]_{(20,2)} \times\left[\begin{array}{c} x_{1} \\ x_{2} \end{array}\right]_{(2,1)}+\left[\begin{array}{c} b_{1}^{[1]} \\ b_{2}^{[1]} \\ \vdots \\ b_{20}^{[1]} \end{array}\right]_{(20,1)}
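A quick NumPy sketch of this shape bookkeeping (the names W1, b1, x here are illustrative, not taken from the code later in this post):
W1 = np.random.randn(20, 2)   # (20, 2)
b1 = np.zeros((20, 1))        # (20, 1)
x = np.random.randn(2, 1)     # one sample, (2, 1)
Z1 = np.dot(W1, x) + b1       # (20, 2) x (2, 1) -> (20, 1), plus (20, 1)
print(Z1.shape)               # (20, 1)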
For the second hidden layer, whose hidden_size is 5, the forward pass works the same way:
Z_2 = W_2H_1 + b_2
H_2 = RELU(Z_2)
The corresponding dimensions are:
H_1:(20,1),W_2:(5,20),b_2:(5,1),Z_2:(5,1),H_2:(5,1)
Finally, at the last layer:
Z_3 = W_3H_2+b_3
\hat{y} = sigmoid(Z_3)
The corresponding dimensions are:
H_2:(5,1),W_3:(1,5),b_3:(1,1),Z_3:(1,1),\hat{y}:(1,1)
We then obtain a scalar value. Based on a threshold we set, e.g. classifying the output as 1 if it is greater than 0.5 and as 0 otherwise, the classification is done, and the whole forward propagation process is complete.
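The thresholding step is one line in NumPy (a sketch; the values in y_hat are made-up example sigmoid outputs):
y_hat = np.array([[0.2, 0.7, 0.9, 0.4]])   # sigmoid outputs, shape (1, m)
predictions = (y_hat > 0.5).astype(int)    # [[0, 1, 1, 0]]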
def initialize_parameters(layer_dims):
    # Initialize the weights and bias terms
    Weight = {}
    bias = {}
    for d in range(1, len(layer_dims)):
        # Scale by 1/sqrt(fan_in) to keep the initial activations from blowing up
        Weight['W'+str(d)] = np.random.randn(layer_dims[d], layer_dims[d-1]) / np.sqrt(layer_dims[d-1])
        bias['b'+str(d)] = np.zeros((layer_dims[d], 1))
    return Weight, bias
# Set the hidden_size of every layer
layer_dims = [2, 20, 5, 1]  # the two hidden layers have sizes 20 and 5
W, b = initialize_parameters(layer_dims)
W["W3"].shape
# the shape of W3: (1, 5)
def forward_propagation(X, Weight, bias, activation):
    # Forward propagation; `activation` is a list of activation-function
    # names (strings), one per layer, e.g. ['relu', 'relu', 'sigmoid']
    act_funcs = {'sigmoid': sigmoid, 'tanh': tanh, 'relu': relu}
    Z = {}
    H = {}
    # Treat the input X as the output of layer 0, i.e. H0
    H['H0'] = X
    L = len(Weight)
    for l in range(1, L+1):
        Z['Z'+str(l)] = np.dot(Weight['W'+str(l)], H['H'+str(l-1)]) + bias['b'+str(l)]
        H['H'+str(l)] = act_funcs[activation[l-1]](Z['Z'+str(l)])
    return H, Z
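A minimal usage sketch, assuming ReLU on both hidden layers and sigmoid on the output, as described above:
H, Z = forward_propagation(train_X, W, b, ['relu', 'relu', 'sigmoid'])
print(H['H3'].shape)  # (1, 300): one prediction per training sample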
Choice of Loss Function
We use the cross-entropy (CE) loss function:
L\left(\hat{y}^{(i)}, y^{(i)}\right)=-\left(y^{(i)} \log \left(\hat{y}^{(i)}\right)+\left(1-{y}^{(i)}\right) \log \left(1-\hat{y}^{(i)}\right)\right)
Over the whole set of samples, the loss function can be defined as:
J(w, b)=\frac{1}{m} \sum_{i=1}^{m} L\left(\hat{y}^{(i)}, y^{(i)}\right)=-\frac{1}{m} \sum_{i=1}^{m}\left[y^{(i)} \log \left(\hat{y}^{(i)}\right)+\left(1-y^{(i)}\right) \log \left(1-\hat{y}^{(i)}\right)\right]+\frac{\lambda}{2 m}\|w\|_{F}^{2}
We add an L2 regularization term at the end of the loss function, which applies weight decay to prevent overfitting.
def compute_cost(H, Y, Weight, lambd=0.7):
    m = Y.shape[1]
    # L2 regularization term: sum of squared weights over all layers
    L2_term = 0
    for key in Weight.keys():
        L2_term += np.sum(np.square(Weight[key]))
    # Cross-entropy of the final layer's output against the labels
    y_hat = H['H'+str(len(H)-1)]
    logprobs = np.multiply(-np.log(y_hat), Y) + np.multiply(-np.log(1-y_hat), 1-Y)
    # During training we iterate over a batch of samples and average the summed loss
    cost = 1./m * np.nansum(logprobs)
    cost += L2_term * lambd / (2*m)
    return cost
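And a usage sketch chaining it with the forward pass above (lambd=0.7 is the default regularization strength from the signature):
H, Z = forward_propagation(train_X, W, b, ['relu', 'relu', 'sigmoid'])
cost = compute_cost(H, train_Y, W)
print(cost)  # a single scalar; it should decrease as training progresses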