Algorithm Practice — Feedforward Neural Networks

Network Diagram and Activation Functions

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_circles

Preparing the data

def load_data():
    # 300 training samples and 100 test samples
    train_X, train_Y = make_circles(n_samples=300, noise=.05)
    test_X, test_Y = make_circles(n_samples=100, noise=.05)
    # visualize the data
    plt.scatter(train_X[:, 0], train_X[:, 1], c=train_Y, s=40, cmap=plt.cm.Spectral)
    train_X = train_X.T
    train_Y = train_Y.reshape((1, train_Y.shape[0]))
    test_X = test_X.T
    test_Y = test_Y.reshape((1, test_Y.shape[0]))
    return train_X, train_Y, test_X, test_Y

train_X,train_Y,test_X,test_Y = load_data()

print(train_X,train_Y,test_X,test_Y)

A few activation functions

$sigmoid(z) = \frac{1}{1+e^{-z}}$
$tanh(z) = \frac{e^z-e^{-z}}{e^z+e^{-z}}$
$relu(z) = \max(z, 0)$

def sigmoid(z):
    # sigmoid(z) = 1 / (1 + e^{-z})
    return 1./(1+np.exp(-z))

def tanh(z):
    # tanh(z) = (e^z - e^{-z}) / (e^z + e^{-z})
    return (np.exp(z)-np.exp(-z)) / (np.exp(z)+np.exp(-z))

def relu(z):
    # (|z| + z) / 2 is max(z, 0) applied elementwise
    return (np.abs(z)+z)/2

# plot each activation over [-5, 5]
x = np.linspace(-5,5,100)
y = sigmoid(x)
plt.plot(x,y)
plt.show()


y = tanh(x)
plt.plot(x,y)
plt.show()

y = relu(x)
plt.plot(x,y)
plt.show()

The derivative of the sigmoid function: $s^{\prime}(x) = s(x)(1-s(x))$
The derivative of the ReLU function:
$relu^{\prime}(z)=\left\{\begin{array}{ll} 1, & z>0 \\ 0, & z \le 0 \end{array}\right.$
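
These derivatives are what backpropagation will need later. A minimal sketch of how they could be coded against the activations defined above (the names sigmoid_derivative and relu_derivative are our own, not part of the original code):

def sigmoid_derivative(z):
    # s'(z) = s(z) * (1 - s(z))
    s = sigmoid(z)
    return s * (1 - s)

def relu_derivative(z):
    # 1 where z > 0, 0 elsewhere
    return np.where(z > 0, 1.0, 0.0)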

The forward propagation process

First, each sample has an input $\left[\begin{array}{l} x_{1} \\ x_{2} \end{array}\right]_{(2,1)}$.
We call the first hidden layer $H_1$; its hidden_size is 20. Writing the pre-activation value (before ReLU is applied) as $Z_1$, we immediately have:
$Z_1 = W_1X+b_1$
$H_1 = ReLU(Z_1)$
where $W_1$ is the weight matrix from the input layer to the first hidden layer and $b_1$ is its bias term.
Here we must pay close attention to the dimensions: $W_1$ has shape (20,2) and $b_1$ is (20,1), so $Z_1$ is (20,1), and likewise $H_1$ is (20,1). The computation looks roughly like this:
$$Z^{[1]}=W X+b^{[1]}=\left[\begin{array}{cc} w_{1,1}^{[1]} & w_{1,2}^{[1]} \\ w_{2,1}^{[1]} & w_{2,2}^{[1]} \\ \vdots & \vdots \\ w_{20,1}^{[1]} & w_{20,2}^{[1]} \end{array}\right]_{(20,2)} \times\left[\begin{array}{c} x_{1} \\ x_{2} \end{array}\right]_{(2,1)}+\left[\begin{array}{c} b_{1}^{[1]} \\ b_{2}^{[1]} \\ \vdots \\ b_{20}^{[1]} \end{array}\right]_{(20,1)}$$
For the second hidden layer, whose hidden_size is 5, the forward propagation steps are:
$Z_2 = W_2H_1 + b_2$
$H_2 = ReLU(Z_2)$
with shapes $H_1:(20,1)$, $W_2:(5,20)$, $b_2:(5,1)$, $Z_2:(5,1)$, $H_2:(5,1)$.
Finally, at the output layer:
$Z_3 = W_3H_2+b_3$
$\hat{y} = sigmoid(Z_3)$
with shapes $H_2:(5,1)$, $W_3:(1,5)$, $b_3:(1,1)$, $Z_3:(1,1)$, $\hat{y}:(1,1)$.
This yields a scalar, which we compare against a threshold: for example, if it is greater than 0.5 we classify the sample as 1, otherwise as 0. That completes the classification and the entire forward propagation pass; a small shape walkthrough is sketched below.
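
As a sanity check on the dimension chain just described, here is a minimal sketch that pushes a single (2,1) input through randomly initialized layers of sizes 20, 5 and 1 (the variable names below are ours, purely for illustration):

# shape walkthrough of the 2 -> 20 -> 5 -> 1 chain (illustrative only)
x_sample = np.random.randn(2, 1)                  # one input sample, shape (2, 1)
W1, b1 = np.random.randn(20, 2), np.zeros((20, 1))
W2, b2 = np.random.randn(5, 20), np.zeros((5, 1))
W3, b3 = np.random.randn(1, 5), np.zeros((1, 1))

Z1 = np.dot(W1, x_sample) + b1; H1 = relu(Z1)     # (20, 1)
Z2 = np.dot(W2, H1) + b2;       H2 = relu(Z2)     # (5, 1)
Z3 = np.dot(W3, H2) + b3;       y_hat = sigmoid(Z3)  # (1, 1)
print(Z1.shape, Z2.shape, y_hat.shape)            # (20, 1) (5, 1) (1, 1)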

def initialize_parameters(layer_dims):
    # initialize the weights and bias terms
    Weight = {}
    bias = {}
    for d in range(1,len(layer_dims)):
        # scale each weight matrix by 1/sqrt(fan_in) of the previous layer
        Weight['W'+str(d)] = np.random.randn(layer_dims[d],layer_dims[d-1]) / np.sqrt(layer_dims[d-1])
        bias['b'+str(d)] = np.zeros((layer_dims[d],1))

    return Weight,bias

# set the hidden_size of every layer
layer_dims = [2,20,5,1]  # the two hidden layers have sizes 20 and 5
W,b = initialize_parameters(layer_dims)
W["W3"].shape
# the shape of W3 is (1,5)
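
A quick way to confirm that every parameter matches the dimension analysis above is to print all the shapes; a small sketch using only the W and b just created:

# print the shape of every weight matrix and bias vector
for name, value in {**W, **b}.items():
    print(name, value.shape)
# expected: W1 (20, 2), W2 (5, 20), W3 (1, 5), b1 (20, 1), b2 (5, 1), b3 (1, 1)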

def forward_propagation(X,Weight,bias,activation):
    # forward propagation
    Z = {}
    H = {}
    # the input X is generally treated as the output of layer 0, i.e. H0
    H['H0'] = X
    L = len(Weight)
    # look up each layer's activation function by its name
    act_funcs = {'sigmoid': sigmoid, 'tanh': tanh, 'relu': relu}
    for l in range(1,L+1):
        Z['Z'+str(l)] = np.dot(Weight['W'+str(l)],H['H'+str(l-1)]) + bias['b'+str(l)]
        H['H'+str(l)] = act_funcs[activation[l-1]](Z['Z'+str(l)])

    return H,Z
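
A hedged usage sketch, assuming the three layers use ReLU, ReLU and sigmoid in that order (the activations list below is our assumption; the original text does not fix it in code):

# run a forward pass over the whole training set and inspect the output shape
activations = ['relu', 'relu', 'sigmoid']   # assumed ordering of the three layers
H, Z = forward_propagation(train_X, W, b, activations)
print(H['H3'].shape)                        # (1, 300): one prediction per training sample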

Choosing the loss function

We use the cross-entropy (CE) loss:
$L\left(\hat{y}^{(i)}, y^{(i)}\right)=-\left(y^{(i)} \log \left(\hat{y}^{(i)}\right)+\left(1-{y}^{(i)}\right) \log \left(1-\hat{y}^{(i)}\right)\right)$
Over the whole sample set, the cost function can be defined as:
$J(w, b)=\frac{1}{m} \sum_{i=1}^{m} L\left(\hat{y}^{(i)}, y^{(i)}\right)+\frac{\lambda}{2 m}\|w\|_{F}^{2}=-\frac{1}{m} \sum_{i=1}^{m}\left[y^{(i)} \log \left(\hat{y}^{(i)}\right)+\left(1-y^{(i)}\right) \log \left(1-\hat{y}^{(i)}\right)\right]+\frac{\lambda}{2 m}\|w\|_{F}^{2}$
The final term is an L2 regularizer that decays the weights and helps prevent overfitting.

def compute_cost(H,Y,Weight,lambd = 0.7):
    m = Y.shape[1]
    # L2 regularization term: sum of squared weights over every layer
    L2_term = 0
    for key in Weight.keys():
        L2_term += (np.sum(np.square(Weight[key])))

    # H['H'+str(len(H)-1)] is the network output y_hat
    logprobs = np.multiply(-np.log(H['H'+str(len(H)-1)]),Y)+np.multiply(-np.log(1-H['H'+str(len(H)-1)]),1-Y)
    # training iterates over a batch of data, so we take the mean over the batch
    cost = 1./m*np.nansum(logprobs)
    cost += L2_term * lambd / (2*m)
    return cost
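
Putting the pieces together, a small sketch of how the cost could be evaluated on the forward pass from above (again, the activations list is our assumption):

# evaluate the regularized cross-entropy cost on the training set
H, Z = forward_propagation(train_X, W, b, ['relu', 'relu', 'sigmoid'])
cost = compute_cost(H, train_Y, W, lambd=0.7)
print(cost)   # a single scalar; it should decrease once training updates W and b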
