神经网络实例Python

      本文采用一个模拟数据集进行神经网络的训练,相关知识点包括数据预处理、BN层、神经网络模型、梯度反向传播、梯度检查、监测训练过程、超参数随机搜索等,使读者掌握一个完整的机器学习流程。

1、生成数据

     生成一个线性不可分的数据集,它是一个随时间增长的振荡数据。

import numpy as np
import matplotlib.pyplot as plt

# 显示数据
def show_data(X, labels):
    plt.scatter(X[:, 0], X[:, 1], c=labels, s=10, cmap=plt.cm.Spectral)
    plt.show()

def gen_random_data(dim, N_class, num_samp_per_class):
    num_examples = num_samp_per_class*N_class
    X = np.random.randn(num_examples, dim) # data matrix (each row = single example)
    labels = np.random.randint(N_class, size = num_examples) # class labels
    show_data(X, labels)
    return (X, labels)

dim = 2  # dimensionality
N_class = 4  # number of classes
layer_param = [dim, 10, 20, N_class]
(X, labels) = gen_random_data(dim, N_class, num_samp_per_class=20)

# 生成数据
def gen_toy_data(dim, N_class, num_samp_per_class):
    num_examples = num_samp_per_class * N_class
    X = np.zeros((num_examples, dim))  # data matrix
    labels = np.zeros(num_examples, dtype='uint8')  # class labels
    for j in range(N_class):
        ix = range(num_samp_per_class * j, num_samp_per_class * (j + 1))
        x = np.linspace(-np.pi, np.pi, num_samp_per_class) + 5  # x axis
        y = np.sin(x + j * np.pi / (0.5 * N_class))
        y += 0.2 * np.sin(10 * x + j * np.pi / (0.5 * N_class))
        y += 0.25 * x + 10
        y += np.random.randn(num_samp_per_class) * 0.1  # noise

        X[ix] = np.c_[x, y]
        labels[ix] = j

    show_data(X, labels)
    return (X, labels)

2、数据预处理

中心化和归一化

def normalize(X):
    # (x-u)/delta
    mean = np.mean(X, axis=0)
    X_norm = X - mean
    std = np.std(X_norm, axis=0)
    X_norm /= std + 10**(-5)
    return (X_norm, mean, std)

PCA白化

def PCA_white(X):
    mean = np.mean(X, axis=0)
    X_norm = X - mean
    cov = np.dot(X_norm.T, X_norm)/X_norm.shape[0]
    U,S,V = np.linalg.svd(cov)
    X_norm = np.dot(X_norm, U)
    X_norm /= np.sqrt(S + 10**(-5))
    return (X_norm, mean, U, S)

数据集按比例2:1:1随机分为训练集、验证集和测试集

def split_data(X, labels):
    split1 = 2
    split2 = 4
    num_examples = X.shape[0]
    shuffle_no = list(range(num_examples))
    np.random.shuffle(shuffle_no)

    X_train = X[shuffle_no[:num_examples//split1]]
    labels_train = labels[shuffle_no[:num_examples//split1]]
    X_val = X[shuffle_no[num_examples//split1:num_examples//split1+num_examples//split2]]
    labels_val = labels[shuffle_no[num_examples//split1:num_examples//split1+num_examples//split2]]
    X_test = X[shuffle_no[-num_examples//4:]]
    labels_test = labels[shuffle_no[-num_examples//4:]]
    return (X_train, labels_train, X_val, labels_val, X_test, labels_test)

进行预处理

def data_preprocess(X_train, X_val, X_test):
    (X_train_pca, mean, U, S) = PCA_white(X_train)
    X_val_pca = np.dot(X_val - mean, U)
    X_val_pca /= np.sqrt(S + 10**(-5))
    X_test_pca = np.dot(X_test - mean, U)
    X_test_pca /= np.sqrt(S + 10**(-5))
    return (X_train_pca, X_val_pca, X_test_pca)

3、网络模型

超参数主要是网络深度(隐含层的数量)和每层神经元的数量。可以采用list结构存储每层神经元数量,包括输入层和输出层。

初始化

# layer_param = [dim, 100, 100, N_class]
def initialize_parameters(layer_param):
    weights = []
    biases = []
    vweights = []
    vbiases = []

    for i in range(len(layer_param) - 1):
        in_depth = layer_param[i]
        out_depth = layer_param[i+1]
        std = np.sqrt(2/in_depth)*0.5
        weights.append(std*np.random.randn(in_depth, out_depth))
        biases.append((np.zeros((1, out_depth))))
        vweights.append(np.zeros((in_depth, out_depth)))
        vbiases.append(np.zeros((1, out_depth)))
    return (weights, biases, vweights, vbiases)

前向计算

def forward(X, layer_param, weights, biases):
    hiddens = []
    hiddens.append(X)
    for i in range(len(layer_param)-2):
        hiddens.append(np.maximum(0, np.dot(hiddens[i], weights[i]) + biases[i]))
    # 最后一层不需要非线性激活函数
    scores = np.dot(hiddens[-1], weights[-1]) + biases[-1]
    return (hiddens, scores)

计算softmax数据损失值

def data_loss_softmax(scores, labels):
    num_examples = scores.shape[0]
    exp_scores = np.exp(scores)
    exp_cores_sum = np.sum(exp_scores, axis=1)
    corect_probs = exp_scores[range(num_examples), labels]/exp_cores_sum
    corect_logprobs = -np.log(corect_probs)
    data_loss = np.sum(corect_logprobs)/num_examples
    return data_loss

计算L2范数损失

def reg_L2_loss(weights, reg):
    reg_loss = 0
    for weight in weights:
        reg_loss += 0.5*reg*np.sum(weight*weight)
    return reg_loss

计算分值矩阵梯度

def dscores_softmax(scores, labels):
    num_examples = scores.shape[0]
    exp_scores = np.exp(scores)
    probs = exp_scores/np.sum(exp_scores, axis=1, keepdims=True)
    dscores = probs
    dscores[range(num_examples), labels] -= 1
    dscores /= num_examples
    return dscores

准确率预测,和前向函数几乎一致,只是不需要保存hidden神经元

def predict(X, labels, layer_param, weights, biases):
    hidden = X
    for i in range(len(layer_param)-2):
        hidden = np.maximum(0, np.dot(hidden, weights[i]) + biases[i])
    scores = np.dot(hidden, weights[-1]) + biases[-1]
    predicted_class = np.argmax(scores, axis=1)
    right_class = predicted_class == labels
    return np.mean(right_class)

梯度反向传播算法

def gradient_backprop(dscores, hiddens, weights, biases, reg):
    dweights = []
    dbiases = []
    dhidden = dscores
    for i in range(len(hiddens)-1, -1, -1):
        dweights.append(np.dot(hiddens[i].T, dhidden) + reg*weights[i])
        dbiases.append(np.sum(dhidden, axis=0, keepdims=True))
        dhidden = np.dot(dhidden, weights[i].T)
        dhidden[hiddens[i]<=0] = 0
    return (dweights, dbiases)

4、梯度检查

对每层的权重和偏置进行梯度检查

def check_gradient(X, labels, layer_param, check_weight_or_bias):
    (X, labels) = gen_random_data(dim, N_class, num_samp_per_class=200)
    (weights, biases, vweights, vbiases) = initialize_parameters(layer_param)
    reg = 10**(-9)
    step = 10**(-5)
    for layer in range(len(weights)):
        if check_weight_or_bias:
            row = np.random.randint(weights[layer].shape[0])
            col = np.random.randint(weights[layer].shape[1])
            param = weights[layer][row][col]
        else:
            row = np.random.randint(weights[layer].shape[1])
            param = biases[layer][0][row]
        # 前向计算
        (hiddens, scores) = forward(X, layer_param, weights, biases)
        dscores = dscores_softmax(scores, labels)
        # 反向计算
        (dweights, dbiases) = gradient_backprop(dscores, hiddens, weights, biases, reg)
        if check_weight_or_bias:
            danalytic = dweights[-1-layer][row][col]
        else:
            danalytic = dbiases[-1-layer][0][row]
        if check_weight_or_bias:
            weights[layer][row][col] = param - step
        else:
            biases[layer][0][row] = param - step
        (hiddens, scores) = forward(X, layer_param, weights, biases)
        data_loss1 = data_loss_softmax(scores, labels)
        reg_loss1 = reg_L2_loss(weights, reg)
        loss1 = data_loss1 + reg_loss1
        if check_weight_or_bias:
            weights[layer][row][col] = param + step
        else:
            biases[layer][0][row] = param + step
        (hiddens, scores) = forward(X, layer_param, weights, biases)
        data_loss2 = data_loss_softmax(scores, labels)
        reg_loss2 = reg_L2_loss(weights, reg)
        loss2 = data_loss2 + reg_loss2
        dnumeric = (loss2 - loss1)/(2*step)

        print(layer, data_loss1, data_loss2)
        error_relative = np.abs(danalytic - dnumeric)/np.maximum(danalytic, dnumeric)
        print(danalytic, dnumeric, error_relative)

5、参数优化

采用Nesterov动量优化方法,同时函数返回了参数更新比例

def nesterov_momentumSGD(vparams, params, dparams, lr, mu):
    update_ratio = []
    for i in range(len(params)):
        pre_vparam = vparams[i]
        vparams[i] = mu*vparams[i] - lr*dparams[-1-i]
        update_param = vparams[i] + mu * (vparams[i] - pre_vparam)
        params[i] += update_param
        update_ratio.append(np.sum(np.abs(update_param))
                            /np.sum(np.abs(params[i])))
    return update_ratio

6、训练网络

初始化参数,前向计算,计算训练集和验证集的准确率,计算数据损失和正则化损失,梯度反向传播,进行学习率指数退化,可视化

def train_net(X_train, labels_train, layer_param, lr,
              lr_decay, reg, mu, max_epoch, X_val, labels_val):
    lr0 = lr
    (weights, biases, vweights, vbiases) = initialize_parameters(layer_param)
    epoch = 0
    data_losses = []
    reg_losses = []
    val_accuracy = []
    train_accuracy = []
    weights_update_ratio = []
    baises_update_ratio = []
    while epoch < max_epoch:
        (hiddens, scores) = forward(X_train, layer_param, weights, biases)
        val_accuracy.append(predict(X_val, labels_val, layer_param, weights, biases))
        train_accuracy.append(predict(X_train, labels_train, layer_param, weights, biases))
        data_loss = data_loss_softmax(scores, labels_train)
        reg_loss = reg_L2_loss(weights, reg)
        dscores = dscores_softmax(scores, labels_train)
        (dweights, dbiases) = gradient_backprop(dscores, hiddens, weights, biases, reg)
        weights_update_ratio.append(nesterov_momentumSGD(
            vweights, weights, dweights, lr, mu
        ))
        baises_update_ratio.append(nesterov_momentumSGD(
            vbiases, biases, dbiases, lr, mu
        ))
        data_losses.append(data_loss)
        reg_losses.append(reg_loss)
        epoch += 1
        lr *= lr_decay
    # 可视化数据损失、训练集和验证集准确率
    plt.close()
    fig = plt.figure('loss')
    ax = fig.add_subplot(2, 1, 1)
    ax.grid(True)
    ax2 = fig.add_subplot(2, 1, 2)
    ax2.grid(True)
    plt.xlabel('log10(lr)=' + str(round((np.log10(lr0)), 2)) + ' ' + 'log10(reg)=' + str(round((np.log10(reg)), 2)),
               fontsize=14)
    plt.ylabel('                              accuracy       log10(data loss)', fontsize=14)
    ax.scatter(np.arange(len(data_losses)), np.log10(data_losses), c='b', marker='.')
    #    ax2.scatter(np.arange(len(reg_losses)), np.log10(reg_losses), c='r',marker='*')
    ax2.scatter(np.arange(len(val_accuracy) - 0), val_accuracy[0:], c='r', marker='*')
    ax2.scatter(np.arange(len(val_accuracy) - 0), train_accuracy[0:], c='g', marker='.')
    #    ax2.scatter(np.arange(len(val_accuracy)), np.log10(1-np.array(val_accuracy)), c='r',marker='*')
    #    ax2.scatter(np.arange(len(val_accuracy)), np.log10(1-np.array(train_accuracy)), c='g',marker='.')
    plt.show()

    # %% 对数显示每层权重和偏置的更新率,合理值在10**(-3)
    for layer in range(len(weights)):
        wur = []
        for i in range(len(weights_update_ratio)):
            wur.append(weights_update_ratio[i][layer])
        bur = []
        for i in range(len(baises_update_ratio)):
            bur.append(baises_update_ratio[i][layer])
        plt.close()
        fig = plt.figure('update ratio')
        ax = fig.add_subplot(2, 1, 1)
        ax.grid(True)
        ax2 = fig.add_subplot(2, 1, 2)
        ax2.grid(True)
        plt.xlabel('log10(lr)=' + str(round((np.log10(lr0)), 2)) + ' ' + 'log10(reg)=' + str(round((np.log10(reg)), 2)),
                   fontsize=14)
        ax.scatter(np.arange(len(wur)), np.log10(wur), c='b', marker='.')
        ax2.scatter(np.arange(len(bur)), np.log10(bur), c='r', marker='*')
        plt.show()

    return (data_losses, reg_losses, weights, biases, val_accuracy)

7、过拟合小数据集

def overfit_tinydata(lr=10 ** (-0.0), lr_decay=1, mu=0.9, reg=0, max_epoch=100):
    (X,labels) = gen_toy_data(dim, N_class, num_samp_per_class=2)
    X,_,_,_ = PCA_white(X)
    layer_param = [dim, 100, 100, N_class]
    (data_losses, reg_losses, weights, biases, accuracy) = train_net(X, labels, layer_param, lr, lr_decay, reg, mu,
                                                                     max_epoch, X, labels)

    return (data_losses, reg_losses, accuracy)

8、超参数随机搜索

def hyperparam_random_search(X_train, labels_train,
                             X_val, labels_val,
                             layer_param, num_try=10,
                             lr=[-1, -5], lr_decay=1,
                             mu=0.9, reg=[-2.0, -5.0],
                             max_epoch=500):
    minlr = min(lr)
    maxlr = max(lr)
    randn = np.random.rand(num_try*2)
    lr_array = 10**(minlr + (maxlr-minlr)*randn[0:num_try])
    minreg = min(reg)
    maxreg = max(reg)
    reg_array = 10**(minreg + (maxreg-minreg)*randn[num_try:2*num_try])
    lr_regs = zip(lr_array, reg_array)
    for lr_reg in lr_regs:
        (data_loss, reg_loss, weights, biases, val_accuracy) = train_net(X_train, labels_train, layer_param,
                    lr_reg[0], lr_decay, lr_reg[1], mu, max_epoch,
                    X_val, labels_val)
    return (weights, biases)
dim = 2  # dimensionality
N_class = 4  # number of classes
layer_param = [dim, 100, 100, N_class]
(X,labels) = gen_toy_data(dim, N_class, num_samp_per_class = 200)
(X_train, labels_train, X_val, labels_val, X_test, labels_test) = split_data(X, labels)
(X_train_pca, X_val_pca, X_test_pca) = data_preprocess(X_train, X_val, X_test)
(weights, biases) = hyperparam_random_search(X_train_pca, labels_train, X_val_pca, labels_val, layer_param,
                         num_try = 2, lr=[-0.1, -0.2], lr_decay=1, mu=0.9, reg=[-2, -5], max_epoch=2000)

上面设置lr=[-0.1, -0.2]时,数据损失波动很大且下降很快,参数更新率也过高

修改lr=[-1.5, -2.5],数据损失基本无波动,训练集和验证集的准确率趋于饱和,过拟合现象不明显

9、评估模型

accuracy = predict(X_test_pca, labels_test, layer_param, weights, biases)
print(accuracy) # 0.785

10、增加BN层

    增加BN层,不会改变程序的总体流程。但是需要注意的是,这里不再需要偏置参数,而是增加了标准查和均值参数,同时需要保留BN层的一些中间结果,如均值和方差等。注意,由于采用全部样本进行训练,所以均值和方差不需要进行移动平均。

BN_EPSILON = 10**(-5)

def initialize_parameters_BN(layer_param):
    weights = []  
    gammas = []
    betas = []
    
    vweights = []
    vgammas = []
    vbetas = []
        
    for i in range(len(layer_param) - 1):  
        in_depth = layer_param[i]
        out_depth = layer_param[i+1]
        std = np.sqrt(2/in_depth)
        weights.append( std * np.random.randn(in_depth, out_depth) )  
        gammas.append( np.ones((1, out_depth)) ) 
        betas.append( np.zeros((1, out_depth)) )          
                
        vweights.append( np.zeros( (in_depth, out_depth) ) )
        vgammas.append( np.zeros((1, out_depth)) )
        vbetas.append( np.zeros((1, out_depth)) )         
    
    params = (weights, gammas, betas) 
    vparams = (vweights, vgammas, vbetas) 
    return (params, vparams)

def BN_forward(X, gamma, beta):   
    mu = np.mean(X, axis=0)
    var = np.var(X, axis=0)
    std = np.sqrt(var + BN_EPSILON)
    X_hat = (X - mu)/std
    Y = gamma*X_hat + beta    
    bn_cache = (mu, var, std)

    return (Y, bn_cache, X_hat)

def BN_backprop(dY, X, X_hat, bn_cache, gamma, beta):
    (mu, var, std) = bn_cache
    batch = X.shape[0]
    dX_hat = gamma*dY
    dbeta = np.sum(dY, axis=0, keepdims=True)
    dgamma = np.sum(X_hat*dY, axis=0, keepdims=True)
    
    dX = dX_hat/std
    dmu = -np.sum(dX_hat, axis=0, keepdims=True)/std
    dstd = -1/(std*std) * np.sum((X - mu)*dX_hat, axis=0)
    dvar = 1/2*(var**(-0.5))*dstd
    dX += 2*(X - mu)/batch * dvar
    dX += dmu/batch   
    
    return (dX, dgamma, dbeta)     
        

def forward_BN(X, layer_param, params):
    hiddens = []
    bn_ins = []
    bn_caches = []
    bn_hats = []
    
    (weights, gammas, betas) = params
    hiddens.append(X)
    for i in range(len(layer_param)-1):  
        hidden = np.dot(hiddens[i], weights[i]) # 1
        bn_ins.append(hidden)
        (hidden, bn_cache, bn_hat) = BN_forward(hidden, gammas[i], betas[i]) # 2
        bn_hats.append(bn_hat)
        bn_caches.append(bn_cache)    
        if i < len(layer_param)-2:
            hiddens.append(np.maximum(0, hidden)) # 3
        else:
            scores = hidden # 4
    return (hiddens, scores, bn_ins, bn_hats, bn_caches)

def predict_BN(X, labels, layer_param, params):
    (weights, gammas, betas) = params 
    hidden = X
    for i in range(0,len(layer_param)-1):  
        hidden = np.dot(hidden, weights[i])
        (hidden, bn_cache, bn_hat) = BN_forward(hidden, gammas[i], betas[i])
        if i < len(layer_param)-2:
            hidden = np.maximum(0, hidden)
        else:
            scores = hidden
    
    predicted_class = np.argmax(scores, axis=1)
    rigth_class = predicted_class == labels         
    return np.mean(rigth_class)

def gradient_backprop_BN(dscores, params, hiddens, bn_ins, bn_hats, bn_caches, reg):
    (weights, gammas, betas) = params   
    dweights = []
    dgammas = []
    dbetas = []
    
    dhidden = dscores    
    for i in range(len(hiddens)-1, -1, -1):
        (dhidden, dgamma, dbeta) = BN_backprop(dhidden, bn_ins[i], bn_hats[i], bn_caches[i], gammas[i], betas[i])
        dgammas.append(dgamma)
        dbetas.append(dbeta)         
        dweights.append(np.dot(hiddens[i].T, dhidden) + reg*weights[i] )
        dhidden = np.dot(dhidden, weights[i].T)
        dhidden[hiddens[i] <= 0] = 0               
    return (dweights, dgammas, dbetas)

#%%
def train_net_BN(data, layer_param, super_params, max_epoch):
    
    (X_train, labels_train, X_val, labels_val) = data
    (lr_w, lr_decay, mu_w, lr_bn, mu_bn, reg) = super_params
    (params, vparams) = initialize_parameters_BN(layer_param)  
    epoch = 0
    
    data_losses = []
    reg_losses = []    
    val_accuracy = []
    train_accuracy = []
    weights_update_ratio = []  
    gammas_update_ratio = []
    betas_update_ratio = []
    
    while epoch < max_epoch:    
        (hiddens, scores, bn_ins, bn_hats, bn_caches) = forward_BN(X_train, layer_param, params) 
        
        val_accuracy.append(predict_BN(X_val, labels_val, layer_param, params)) 
        train_accuracy.append(predict_BN(X_train, labels_train, layer_param, params)) 
        
        data_loss = data_loss_softmax(scores, labels_train)    
        reg_loss = reg_L2_loss(params[0], reg)    
        dscores = dscores_softmax(scores, labels_train)  
        
        (dweights, dgammas, dbetas) = gradient_backprop_BN(dscores, params, hiddens, bn_ins, bn_hats, bn_caches, reg)    
        weights_update_ratio.append(nesterov_momentumSGD(vparams[0], params[0], dweights, lr_w, mu_w))
        gammas_update_ratio.append(nesterov_momentumSGD(vparams[1], params[1], dgammas, lr_bn, mu_bn))
        betas_update_ratio.append(nesterov_momentumSGD(vparams[2], params[2], dbetas, lr_bn, mu_bn))
        
        data_losses.append(data_loss)
        reg_losses.append(reg_loss)        
        epoch += 1
        lr_w *= lr_decay
  
    # 省略可视化结果代码      
    plt.close()
    fig=plt.figure('loss')
    ax=fig.add_subplot(2,1,1)
#    ax.set_title('data loss')
    ax.grid(True)
    ax2=fig.add_subplot(2,1,2)
#    ax2.set_title('reg loss')
    ax2.grid(True)
    plt.xlabel( 'log10(lr_w)=' + str(round((np.log10(lr_w)),2)) + ' ' +  'log10(reg)=' + str(round((np.log10(reg)),2)), fontsize=14)
    plt.ylabel('                              accuracy       log10(data loss)', fontsize=14)     
    ax.scatter(np.arange(len(data_losses)), np.log10(data_losses), c='b',marker='.')
#    ax2.scatter(np.arange(len(reg_losses)), np.log10(reg_losses), c='r',marker='*')
    
#    ax2.scatter(np.arange(len(val_accuracy)), np.log10(1-np.array(val_accuracy)), c='r',marker='*')
#    ax2.scatter(np.arange(len(val_accuracy)), np.log10(1-np.array(train_accuracy)), c='g',marker='.')
    ax2.scatter(np.arange(len(val_accuracy)-0), val_accuracy[0:], c='r',marker='*')
    ax2.scatter(np.arange(len(val_accuracy)-0), train_accuracy[0:], c='g',marker='.')
    plt.show() 
    
    #%%
    for layer in range(len(params[0])):
        wur = []
        for i in range(len(weights_update_ratio)):
            wur.append( weights_update_ratio[i][layer] )            
        bur = []
        for i in range(len(betas_update_ratio)):
            bur.append( betas_update_ratio[i][layer] )
        
        plt.close()
        fig=plt.figure('update ratio')
        ax=fig.add_subplot(2,1,1)
        ax.grid(True)
        ax2=fig.add_subplot(2,1,2)
        ax2.grid(True)
        plt.xlabel( 'log10(lr_w)=' + str(round((np.log10(lr_w)),2)) + ' ' +  'log10(reg)=' + str(round((np.log10(reg)),2)), fontsize=14)
#        plt.ylabel('log10(baises_update_ratio) log10(weights_update_ratio)', fontsize=14)     
        ax.scatter(np.arange(len(wur)), np.log10(wur), c='b',marker='.')
        ax2.scatter(np.arange(len(bur)), np.log10(bur), c='r',marker='*')
        plt.show() 

 

 

展开阅读全文

没有更多推荐了,返回首页