Implementing a Deep Neural Network in Python from Start to Finish


from sklearn.datasets import make_classification
from sklearn import preprocessing
import numpy as np
import math
from matplotlib import pyplot as plt
from copy import deepcopy



def ReLU(X):
    return X * (X > 0)

def dReLU(X):
    return 1. * (X > 0)

def Sigmoid(X):
    return 1.0 / (1.0 + np.exp(-X))

def dSigmoid(X):
    return Sigmoid(X) * (1 - Sigmoid(X))
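
# Quick sanity check of the derivatives above: compare them against a centred
# finite difference. The probe points and epsilon are arbitrary choices made
# here for illustration, not part of the model itself.
_eps = 1e-6
_x = np.array([[-2.0, -0.5, 0.3, 1.7]])
assert np.allclose((Sigmoid(_x + _eps) - Sigmoid(_x - _eps)) / (2 * _eps), dSigmoid(_x), atol=1e-6)
assert np.allclose((ReLU(_x + _eps) - ReLU(_x - _eps)) / (2 * _eps), dReLU(_x), atol=1e-6)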

X, Y = make_classification(n_features=2, n_redundant=0, n_informative=2,
                             n_clusters_per_class=1, n_samples=6000)


X_train = X[0:int(X.shape[0]*0.7),:]
Y_train = Y[0:int(X.shape[0]*0.7)]
X_test = X[int(X.shape[0]*0.7):,:]
Y_test = Y[int(X.shape[0]*0.7):]

scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

X_train = X_train.T  # shape (n, m): number of features x number of samples
Y_train = Y_train.reshape(1,Y_train.shape[0])
X_test = X_test.T  # n*m
Y_test = Y_test.reshape(1,Y_test.shape[0])

X = X.T


# Build the neural network
# hyperparameters
m = X_train.shape[1]

n = [X_train.shape[0],8,4,2,1]  # number of neurons in each layer, including the input layer
f = [0, ReLU, ReLU, ReLU, Sigmoid]  # activation function of each layer; the leading 0 is a placeholder
df = [0, dReLU, dReLU, dReLU, dSigmoid]  # derivative of each layer's activation; the leading 0 is a placeholder
# n = [X_train.shape[0],1]  # number of neurons in each layer, including the input layer
# f = [0, Sigmoid]  # activation function of each layer; the leading 0 is a placeholder
# df = [0, dSigmoid]  # derivative of each layer's activation; the leading 0 is a placeholder


layers = len(n)-1
# parameters
W = [0 for i in range(len(n))]  # the leading 0 is padding so that W[l] holds the parameters of layer l
b = [0 for i in range(len(n))]
for l in range(1, len(n)):
    W[l] = np.random.randn(n[l],n[l-1])*np.sqrt(2.0/n[l-1])  # scaling by np.sqrt(2.0/n[l-1]) avoids vanishing/exploding gradients; 2.0 works better than 1.0 for ReLU
    b[l] = np.random.randn(n[l],1)
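
# The 2/n scaling above is He initialization. A quick numerical illustration
# (a sketch; the fan-in, unit count, and probe-sample count below are arbitrary
# choices for the demo): with this scaling the pre-activation variance stays
# near 2, which compensates for ReLU zeroing out roughly half of the units.
_fan_in, _units, _m_probe = 500, 64, 2000  # arbitrary sizes for the illustration
_x0 = np.random.randn(_fan_in, _m_probe)
_w1 = np.random.randn(_units, _fan_in) * np.sqrt(2.0 / _fan_in)
print("He-init pre-activation variance: %f" % np.var(np.dot(_w1, _x0)))  # approximately 2.0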

# allocate intermediate values (forward/backward caches)
Z = [0 for i in range(len(n))]
A = [0 for i in range(len(n))]
dZ = [0 for i in range(len(n))]
dA = [0 for i in range(len(n))]
dW = [0 for i in range(len(n))]
db = [0 for i in range(len(n))]


# training loop
# hyperparameters
rate = 0.01
iteration = 5000
lambd = 0.01


# for plotting
loss_train = []
loss_test = []
accuracy_train = 0
accuracy_test = 0

# set debug = True to enable gradient checking
debug = False
epsilon = 0.00001

for i in range(iteration):
    # forward
    Z[0] = X_train
    A[0] = X_train
    for l in range(1, len(n)):
        Z[l] = np.dot(W[l], A[l-1]) + b[l]
        A[l] = f[l](Z[l])  
    assert(A[layers].shape == (n[layers],m))
    assert not (A[layers] < 0).any()  # sigmoid output must be non-negative
    l2_norm = sum([np.sum(w**2) for w in W])*lambd/(2.0*m)
    J_train = -(np.dot(np.log(A[layers]),Y_train.T)+np.dot(np.log(1-A[layers]),(1-Y_train).T))/m + l2_norm  # add the L2 penalty; it only affects dW[l]
    # predict train
    Y_pred = 1*(A[layers]>0.5)
    accuracy_train = (Y_pred == Y_train).mean()
#     print(J_train)

    # backward
    dA[layers] = -Y_train/A[layers] + (1-Y_train)/(1-A[layers])  # output layer
    for l in range(len(n)-1, 0, -1):
        dZ[l] = dA[l]*df[l](Z[l])  # for the output layer this simplifies to dZ = A - Y_train
        assert(dZ[l].shape == Z[l].shape)
        dW[l] = np.dot(dZ[l], A[l-1].T)/m + lambd*W[l]/m  # second term is the gradient of the L2 penalty
        assert(dW[l].shape == W[l].shape)
        db[l] = np.sum(dZ[l], axis=1, keepdims=True)/m
        assert(db[l].shape == b[l].shape)
        dA[l-1] = np.dot(W[l].T,dZ[l])
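
    # Why the output-layer shortcut noted above holds: for a sigmoid output unit
    # with the cross-entropy cost, the chain rule gives
    #   dZ[L] = dA[L] * Sigmoid'(Z[L]) = (-Y/A + (1-Y)/(1-A)) * A*(1-A) = A - Y
    # so the output-layer error is simply the prediction minus the label.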

    # grad check 
    if debug:
        W_big = deepcopy(W)
        b_big = deepcopy(b)
        W_small = deepcopy(W)
        b_small = deepcopy(b)
        dW_diff = deepcopy(W)
        db_diff = deepcopy(b)
        Z_big = deepcopy(Z)
        A_big = deepcopy(A)
        Z_small = deepcopy(Z)
        A_small = deepcopy(A)
        # flatten to vector
        theta = np.array([])
        dtheta = np.array([])  # store dW db for check
        for l in range(1, len(n)):
            theta = np.concatenate([theta,W[l].flatten()])
            theta = np.concatenate([theta,b[l].flatten()])
            dtheta = np.concatenate([dtheta,dW[l].flatten()])
            dtheta = np.concatenate([dtheta,db[l].flatten()])
        # numerically estimate the gradient of every parameter in theta
        dtheta_debug = np.zeros(dtheta.shape)
        for t in range(len(theta)):
            # perturb one parameter up and down by epsilon
            theta_big = theta.copy()
            theta_small = theta.copy()
            theta_big[t] = theta[t] + epsilon 
            theta_small[t] = theta[t] - epsilon 
            node_cnt = 0
            # restore the perturbed ("big" and "small") W and b from the flattened vectors
            for l in range(1, len(n)):
                W_big[l] = theta_big[node_cnt:node_cnt+n[l]*n[l-1]].reshape((n[l],n[l-1]))
                W_small[l] = theta_small[node_cnt:node_cnt+n[l]*n[l-1]].reshape((n[l],n[l-1]))
                node_cnt = node_cnt+n[l]*n[l-1]
                b_big[l] = theta_big[node_cnt:node_cnt+n[l]*1].reshape((n[l],1))
                b_small[l] = theta_small[node_cnt:node_cnt+n[l]*1].reshape((n[l],1))
                node_cnt = node_cnt+n[l]*1
            # forward
            Z_big[0] = X_train
            A_big[0] = X_train
            Z_small[0] = X_train
            A_small[0] = X_train
            for l in range(1, len(n)):
                Z_big[l] = np.dot(W_big[l], A_big[l-1]) + b_big[l]
                A_big[l] = f[l](Z_big[l])
                Z_small[l] = np.dot(W_small[l], A_small[l-1]) + b_small[l]
                A_small[l] = f[l](Z_small[l])
            l2_norm_big = sum([np.sum(w**2) for w in W_big])*lambd/(2.0*m)
            J_train_big = -(np.dot(np.log(A_big[layers]),Y_train.T)+np.dot(np.log(1-A_big[layers]),(1-Y_train).T))/m + l2_norm_big
            l2_norm_small = sum([np.sum(w**2) for w in W_small])*lambd/(2.0*m)
            J_train_small = -(np.dot(np.log(A_small[layers]),Y_train.T)+np.dot(np.log(1-A_small[layers]),(1-Y_train).T))/m + l2_norm_small
            dtheta_debug[t] = (J_train_big-J_train_small)/(2.0*epsilon )
        d_diff = dtheta - dtheta_debug
        node_cnt = 0
        # restore to dw and db
        for l in range(1, len(n)):
            dW_diff[l] = d_diff[node_cnt:node_cnt+n[l]*n[l-1]].reshape((n[l],n[l-1]))
            node_cnt = node_cnt+n[l]*n[l-1]
            db_diff[l] = d_diff[node_cnt:node_cnt+n[l]*1].reshape((n[l],1))
            node_cnt = node_cnt+n[l]*1
        grad_diff = np.sqrt(np.sum((dtheta-dtheta_debug)**2))/(np.sqrt(np.sum(dtheta**2))+np.sqrt(np.sum(dtheta_debug**2)))
#         print("dtheta diff: %f" % grad_diff)

    # gradient descent update
    for l in range(len(n)-1, 0, -1):
        W[l] -= rate*dW[l]
        b[l] -= rate*db[l]

#     print("Iteration %d Loss: %lf" % (i, J))

    # evaluate on the test set
    A_tmp = X_test
    for l in range(1, len(n)):
        Z_tmp = np.dot(W[l], A_tmp) + b[l]
        A_tmp = f[l](Z_tmp)
    J_test = -(np.dot(np.log(A_tmp),Y_test.T)+np.dot(np.log(1-A_tmp),(1-Y_test).T))/X_test.shape[1]
    Y_pred = 1*(A_tmp>0.5)
    accuracy_test = (Y_pred == Y_test).mean()

    # save loss
    loss_train.append(J_train[0][0])
    loss_test.append(J_test[0][0])

# final accuracy
print("accuracy_train: %lf" % accuracy_train)
print("accuracy_test: %lf" % accuracy_test)



plt.figure(num=0, figsize=(6, 8), dpi=80, facecolor='w', edgecolor='k')
plt.plot(range(iteration), loss_train, c="blue", label="train loss")
plt.plot(range(iteration), loss_test, c="red", label="test loss")
plt.legend()
plt.show()


# scatter plot of the raw dataset, colored by class label
plt.figure(num=None, figsize=(6, 8), dpi=80, facecolor='w', edgecolor='k')
plt.scatter(X[0], X[1], marker='o', c=Y, s=5, edgecolor='k')
plt.show()
