《卷积神经网络的Python实现》 Notes (3)



Implementing a Neural Network in Code

Using randomly generated toy data, we train a neural network and pull together the material covered earlier: data preprocessing, the network model, gradient backpropagation, gradient checking, monitoring the training process, and random hyperparameter search.


1. Generating Data

Generate a set of toy data:

import numpy as np

num_samp_per_class = 200  # 200 samples per class
dim = 2  # 2-dimensional features
N_class = 4  # 4 classes

def gen_toy_data(dim, N_class, num_samp_per_class):
    num_examples = num_samp_per_class*N_class
    X = np.zeros((num_examples, dim))
    labels = np.zeros(num_examples, dtype='uint8')
    for j in range(N_class):
        ix = range(num_samp_per_class*j, num_samp_per_class*(j+1))
        x = np.linspace(-np.pi, np.pi, num_samp_per_class) + 5
        y = np.sin(x + j*np.pi/(0.5*N_class))
        y += 0.2*np.sin(10*x + j*np.pi/(0.5*N_class))
        y += 0.25*x + 10  # add a linear trend on top of the nonlinearity
        y += np.random.randn(num_samp_per_class)*0.1  # add noise

        X[ix] = np.c_[x, y]  # stack x and y as columns
        labels[ix] = j  # class label

    return (X, labels)

Visualize the data:

import matplotlib.pyplot as plt

def show_data(X,labels):
    plt.scatter(X[:,0],X[:,1],c=labels,s=40,cmap=plt.cm.Spectral)
    plt.show()
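
A minimal usage sketch, using the globals defined above, to generate the toy data and plot it:

(X, labels) = gen_toy_data(dim, N_class, num_samp_per_class)
show_data(X, labels)  # four interleaved curves, one color per class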

2. Data Preprocessing

Centering and normalization:

def normalize(X):  # centering and normalization
    mean = np.mean(X, axis=0)  # per-feature mean
    X_norm = X - mean  # centering
    std = np.std(X_norm, axis=0)  # per-feature standard deviation
    X_norm /= std + 10**(-5)  # normalization; the small constant avoids dividing by zero
    return (X_norm, mean, std)
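
A quick sanity check (a sketch, assuming X is the toy data generated above): after centering and normalization, each feature should have roughly zero mean and unit standard deviation.

(X_norm, mean, std) = normalize(X)
print(np.mean(X_norm, axis=0))  # close to [0, 0]
print(np.std(X_norm, axis=0))   # close to [1, 1]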

PCA and whitening:

def PCA_white(X):  # PCA followed by whitening
    mean = np.mean(X, axis=0)
    X_norm = X - mean
    cov = np.dot(X_norm.T, X_norm)/X_norm.shape[0]  # covariance matrix
    U, S, V = np.linalg.svd(cov)  # singular value decomposition
    X_norm = np.dot(X_norm, U)  # rotate (decorrelate) the data
    # whitening: give every direction the same variance; the small constant 10**(-5) avoids dividing by zero
    X_norm /= np.sqrt(S + 10**(-5))
    return (X_norm, mean, U, S)
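
A small check (again a sketch on the toy data X): after PCA and whitening, the covariance matrix of the data should be close to the identity.

(X_white, mean, U, S) = PCA_white(X)
cov_white = np.dot(X_white.T, X_white) / X_white.shape[0]
print(np.round(cov_white, 3))  # approximately the 2x2 identity matrix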

Randomly split the data:

def split_data(X, labels):  # split into training, validation, and test sets in a 2:1:1 ratio
    num_examples = X.shape[0]
    shuffle_no = list(range(num_examples))
    np.random.shuffle(shuffle_no)  # shuffle the sample indices

    X_train = X[shuffle_no[:num_examples//2]]
    labels_train = labels[shuffle_no[:num_examples//2]]

    X_val = X[shuffle_no[num_examples//2:num_examples//2+num_examples//4]]
    labels_val = labels[shuffle_no[num_examples//2:num_examples//2+num_examples//4]]

    X_test = X[shuffle_no[-num_examples//4:]]
    labels_test = labels[shuffle_no[-num_examples//4:]]

    return (X_train, labels_train, X_val, labels_val, X_test, labels_test)
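
For example (a sketch, assuming the 800-sample toy data generated above), the split produces 400/200/200 samples:

(X_train, labels_train, X_val, labels_val, X_test, labels_test) = split_data(X, labels)
print(X_train.shape, X_val.shape, X_test.shape)  # (400, 2) (200, 2) (200, 2)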

Preprocess the data:

# PCA-whiten the validation and test sets using the mean, U and S computed on the training set
def data_preprocess(X_train, X_val, X_test):
    (X_train_pca,mean,U,S) = PCA_white(X_train)

    X_val_pca = np.dot(X_val-mean,U)
    X_val_pca /= np.sqrt(S + 10**(-5))

    X_test_pca = np.dot(X_test-mean,U)
    X_test_pca /= np.sqrt(S+10**(-5))
    return (X_train_pca,X_val_pca,X_test_pca)

3. The Network Model

Initialize the weights:

def initialize_parameters(layer_param):  # weight initialization
    weights = []
    biases = []
    vweights = []
    vbiases = []

    for i in range(len(layer_param) - 1):
        in_depth = layer_param[i]
        out_depth = layer_param[i+1]
        # He initialization sqrt(2/n), scaled by 0.5 so that the initial data loss is close to -log(1/N_class)
        std = np.sqrt(2/in_depth)*0.5
        weights.append(std*np.random.randn(in_depth, out_depth))
        biases.append(np.zeros((1, out_depth)))
        vweights.append(np.zeros((in_depth, out_depth)))
        vbiases.append(np.zeros((1, out_depth)))
    return (weights, biases, vweights, vbiases)

Forward pass:

def forward(X, layer_param, weights, biases):  # forward pass of the model
    hiddens = []
    hiddens.append(X)

    for i in range(len(layer_param)-2):
        hiddens.append(np.maximum(0, np.dot(hiddens[i], weights[i]) + biases[i]))  # ReLU hidden layers
    scores = np.dot(hiddens[-1], weights[-1]) + biases[-1]  # final linear layer: class scores

    return (hiddens, scores)

Softmax loss:

def data_loss_softmax(scores, labels):  # softmax (cross-entropy) data loss
    num_examples = scores.shape[0]
    exp_scores = np.exp(scores)
    exp_scores_sum = np.sum(exp_scores, axis=1)
    correct_probs = exp_scores[range(num_examples), labels]/exp_scores_sum
    correct_logprobs = -np.log(correct_probs)
    data_loss = np.sum(correct_logprobs)/num_examples
    return data_loss
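
As a quick check of the formula (a standalone sketch with made-up scores and labels): when all class scores are equal, every class gets probability 1/N_class, so the loss is log(N_class) ≈ 1.386 for 4 classes; this is also roughly the data loss to expect right after initialization.

toy_scores = np.zeros((8, 4))                  # 8 samples, 4 classes, all scores equal
toy_labels = np.random.randint(4, size=8)
print(data_loss_softmax(toy_scores, toy_labels), np.log(4))  # both ≈ 1.386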

L2 regularization loss:

def reg_L2_loss(weights, reg):  # L2 regularization loss over all weight matrices
    reg_loss = 0
    for weight in weights:
        reg_loss += 0.5*reg*np.sum(weight*weight)
    return reg_loss

Gradient of the score matrix:

def dscores_softmax(scores, labels):  # gradient of the loss with respect to the score matrix
    num_examples = scores.shape[0]
    exp_scores = np.exp(scores)
    probs = exp_scores/np.sum(exp_scores ,axis= 1,keepdims=True)
    dscores = probs
    dscores[range(num_examples),labels] -= 1
    dscores /= num_examples
    return dscores
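
The gradient computed here is (probs - one_hot(labels)) / num_examples. As a quick verification, a finite-difference comparison on one entry (a standalone sketch; the random test case and step size are arbitrary):

s = np.random.randn(5, 4)
y = np.random.randint(4, size=5)
eps = 1e-6
s1 = s.copy(); s1[0, 0] -= eps
s2 = s.copy(); s2[0, 0] += eps
num_grad = (data_loss_softmax(s2, y) - data_loss_softmax(s1, y)) / (2 * eps)
print(num_grad, dscores_softmax(s, y)[0, 0])  # the two values should agree closely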

Accuracy prediction (predict is nearly identical to forward, except that the hidden activations do not need to be stored):

def predict(X, labels, layer_param, weights, biases):
    hidden = X
    for i in range(len(layer_param)-2):
        hidden = np.maximum(0, np.dot(hidden,weights[i])+biases[i])
    scores = np.dot(hidden,weights[-1]) +biases[-1]
    predicted_class = np.argmax(scores ,axis=1)
    right_class = predicted_class == labels
    return np.mean(right_class)

Backpropagation:

def gradient_backprop(dscores, hidden, weights, biases, reg):  # gradient backpropagation
    dweights = []
    dbiases = []
    dhidden = dscores
    # walk backwards through the layers; gradients are therefore appended
    # from the last layer to the first (dweights[0] belongs to the final layer)
    for i in range(len(hidden)-1, -1, -1):
        dweights.append(np.dot(hidden[i].T, dhidden) + reg*weights[i])
        dbiases.append(np.sum(dhidden, axis=0, keepdims=True))
        dhidden = np.dot(dhidden, weights[i].T)
        dhidden[hidden[i] <= 0] = 0  # ReLU gradient: zero wherever the activation was zero
    return (dweights, dbiases)

4. Gradient Checking


def gen_random_data(dim, N_class, num_samp_per_class):  # random data for the gradient check
    num_examples = num_samp_per_class*N_class
    X = np.random.randn(num_examples, dim)
    labels = np.random.randint(N_class, size=num_examples)
    return (X, labels)

def check_gradient(X, labels, layer_param, check_weight_or_bias):
    # example usage:
    # (X, labels) = gen_random_data(dim, N_class, num_samp_per_class=200)
    # layer_param = [dim, N_class]
    # layer_param = [dim, 10, 20, N_class]
    # check_weight_or_bias: 1 for weight, 0 for bias

    (weights, biases, vweights, vbiases) = initialize_parameters(layer_param)
    reg = 10**(-9)
    step = 10**(-5)
    for layer in range(len(weights)):
        # pick one random entry of this layer's weight matrix (or bias vector)
        if check_weight_or_bias:
            row = np.random.randint(weights[layer].shape[0])
            col = np.random.randint(weights[layer].shape[1])
            param = weights[layer][row][col]
        else:
            row = np.random.randint(biases[layer].shape[1])
            param = biases[layer][0][row]

        # analytic gradient at the unperturbed parameters
        (hiddens, scores) = forward(X, layer_param, weights, biases)
        dscores = dscores_softmax(scores, labels)
        (dweights, dbiases) = gradient_backprop(dscores, hiddens, weights, biases, reg)
        if check_weight_or_bias:
            danalytic = dweights[-1-layer][row][col]  # gradients are stored last layer first
        else:
            danalytic = dbiases[-1-layer][0][row]

        # numerical gradient by central differences: evaluate the loss at param-step and param+step
        if check_weight_or_bias:
            weights[layer][row][col] = param - step
        else:
            biases[layer][0][row] = param - step
        (hiddens, scores) = forward(X, layer_param, weights, biases)
        data_loss1 = data_loss_softmax(scores, labels)
        reg_loss1 = reg_L2_loss(weights, reg)
        loss1 = data_loss1 + reg_loss1

        if check_weight_or_bias:
            weights[layer][row][col] = param + step
        else:
            biases[layer][0][row] = param + step
        (hiddens, scores) = forward(X, layer_param, weights, biases)
        data_loss2 = data_loss_softmax(scores, labels)
        reg_loss2 = reg_L2_loss(weights, reg)
        loss2 = data_loss2 + reg_loss2

        # restore the original value before checking the next layer
        if check_weight_or_bias:
            weights[layer][row][col] = param
        else:
            biases[layer][0][row] = param

        dnumeric = (loss2 - loss1)/(2*step)
        print(layer, data_loss1, data_loss2)

        error_relative = np.abs(danalytic - dnumeric)/np.maximum(np.abs(danalytic), np.abs(dnumeric))
        print(danalytic, dnumeric, error_relative)

5. Parameter Optimization

def nesterov_momentumSGD(vparams, params, dparams, lr, mu):  # Nesterov momentum parameter update
    update_ratio = []
    for i in range(len(params)):
        # dparams is ordered from the last layer to the first, hence the index -1-i
        pre_vparam = vparams[i]
        vparams[i] = mu*vparams[i] - lr*dparams[-1-i]
        update_param = vparams[i] + mu*(vparams[i] - pre_vparam)
        params[i] += update_param
        update_ratio.append(np.sum(np.abs(update_param))/np.sum(np.abs(params[i])))
    return update_ratio
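
For reference, the update above is the standard Nesterov momentum step rewritten in terms of the velocity; a scalar sketch (the mu, lr, and gradient values below are arbitrary):

# v_prev = v;  v = mu*v - lr*dx;  x += v + mu*(v - v_prev)   (equivalently x += -mu*v_prev + (1+mu)*v)
mu, lr = 0.9, 0.1
x, v, dx = 1.0, 0.0, 0.5
v_prev = v
v = mu * v - lr * dx
x += v + mu * (v - v_prev)
print(x, v)  # x = 0.905, v = -0.05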

6. Training the Network

def train_net(X_train, labels_train, layer_param, lr, lr_decay, reg, mu, max_epoch, X_val, labels_val):
    (weights, biases, vweights, vbiases) = initialize_parameters(layer_param)  # 1. initialize the parameters
    epoch = 0
    data_losses = []
    reg_losses = []

    val_accuracy = []
    train_accuracy = []
    weights_update_ratio = []
    biases_update_ratio = []
    while epoch < max_epoch:
        (hiddens, scores) = forward(X_train, layer_param, weights, biases)  # 2. forward pass: hidden activations and class scores

        val_accuracy.append(predict(X_val, labels_val, layer_param, weights, biases))  # 3. accuracy on the training and validation sets
        train_accuracy.append(predict(X_train, labels_train, layer_param, weights, biases))

        data_loss = data_loss_softmax(scores, labels_train)  # 4. data loss and regularization loss
        reg_loss = reg_L2_loss(weights, reg)
        dscores = dscores_softmax(scores, labels_train)  # 5. backpropagation starts with the gradient of the score matrix
        (dweights, dbiases) = gradient_backprop(dscores, hiddens, weights, biases, reg)  # 6. backpropagate through the layers
        weights_update_ratio.append(nesterov_momentumSGD(vweights, weights, dweights, lr, mu))  # 7. update parameters and record the update ratios
        biases_update_ratio.append(nesterov_momentumSGD(vbiases, biases, dbiases, lr, mu))
        data_losses.append(data_loss)
        reg_losses.append(reg_loss)
        epoch += 1
        lr *= lr_decay  # 8. exponential learning-rate decay

    # visualize the data loss and the training/validation accuracy
    plt.close()
    fig = plt.figure('loss')
    ax = fig.add_subplot(2, 1, 1)
    ax.grid(True)
    ax2 = fig.add_subplot(2, 1, 2)
    ax2.grid(True)
    plt.xlabel('log10(lr)=' + str(round((np.log10(lr)), 2)) + ' ' + 'log10(reg)=' + str(round((np.log10(reg)), 2)),
               fontsize=14)
    plt.ylabel('                              accuracy       log10(data loss)', fontsize=14)

    ax.scatter(np.arange(len(data_losses)), np.log10(data_losses), c='b', marker='.')
    #    ax2.scatter(np.arange(len(reg_losses)), np.log10(reg_losses), c='r',marker='*')

    ax2.scatter(np.arange(len(val_accuracy)), val_accuracy, c='r', marker='*')
    ax2.scatter(np.arange(len(train_accuracy)), train_accuracy, c='g', marker='.')
    #    ax2.scatter(np.arange(len(val_accuracy)), np.log10(1-np.array(val_accuracy)), c='r',marker='*')
    #    ax2.scatter(np.arange(len(val_accuracy)), np.log10(1-np.array(train_accuracy)), c='g',marker='.')
    plt.show()

    # %% plot (on a log10 scale) the per-layer update ratios of the weights and biases; a reasonable value is around 10**(-3)
    for layer in range(len(weights)):
        wur = []
        for i in range(len(weights_update_ratio)):
            wur.append(weights_update_ratio[i][layer])

        bur = []
        for i in range(len(biases_update_ratio)):
            bur.append(biases_update_ratio[i][layer])

        plt.close()
        fig = plt.figure('update ratio')
        ax = fig.add_subplot(2, 1, 1)
        ax.grid(True)
        ax2 = fig.add_subplot(2, 1, 2)
        ax2.grid(True)
        plt.xlabel('log10(lr)=' + str(round((np.log10(lr)), 2)) + ' ' + 'log10(reg)=' + str(round((np.log10(reg)), 2)),
                   fontsize=14)
        ax.scatter(np.arange(len(wur)), np.log10(wur), c='b', marker='.')
        ax2.scatter(np.arange(len(bur)), np.log10(bur), c='r', marker='*')
        plt.show()

    return (data_losses, reg_losses, weights, biases, val_accuracy)

7. Overfitting a Tiny Dataset

def overfit_tinydata(X, labels, layer_param, lr=10 ** (-0.0), lr_decay=1, mu=0.9, reg=0, max_epoch=100):  # overfit a tiny dataset: turn off regularization (reg=0)
    #    (X,labels) = gen_toy_data(dim, N_class, num_samp_per_class=2)
    #    X,_,_,_ = PCA_white(X)
    #    layer_param = [dim, 100, 100, N_class]
    (data_losses, reg_losses, weights, biases, accuracy) = train_net(X, labels, layer_param, lr, lr_decay, reg, mu,max_epoch, X, labels)

    return (data_losses, reg_losses, accuracy)
    # data_loss = 4.223167361579445e-05

8. Random Hyperparameter Search

# Random search over the hyperparameters; when convergence is fast, learning-rate annealing is unnecessary and lr_decay can be set to 1.
def hyperparam_random_search(X_train, labels_train, X_val, labels_val, layer_param, num_try=10, lr=[-1, -5],
                             lr_decay=0.997, mu=0.9, reg=[-2.0, -5.0], max_epoch=500):
    #    (X,labels) = gen_toy_data(dim, N_class, num_samp_per_class=200)
    #    layer_param = [dim, 100, 100, N_class]

    minlr = min(lr)
    maxlr = max(lr)
    randn = np.random.rand(num_try * 2)
    lr_array = 10 ** (minlr + (maxlr - minlr) * randn[0: num_try])
    minreg = min(reg)
    maxreg = max(reg)
    reg_array = 10 ** (minreg + (maxreg - minreg) * randn[num_try: 2 * num_try])
    lr_regs = zip(lr_array, reg_array)

    for lr_reg in lr_regs:
        (data_loss, reg_loss, weights, biases, val_accuracy) = train_net(X_train, labels_train, layer_param, lr_reg[0],
                                                                         lr_decay, lr_reg[1], mu, max_epoch, X_val,
                                                                         labels_val)

    # note: only the weights and biases from the last trial are returned
    return (weights, biases)

9. Program Structure

if __name__ == '__main__':
    # %%
    dim = 2  # dimensionality
    N_class = 4  # number of classes

    # %%

    layer_param = [dim, 10, 20, N_class]
    (X, labels) = gen_random_data(dim, N_class, num_samp_per_class=20)
    for i in range(2):
        check_gradient(X, labels, layer_param, 1)

#    #%%
    layer_param = [dim, 100, 100, N_class]
    (X, labels) = gen_toy_data(dim, N_class, num_samp_per_class=2)
    X, _, _, _ = PCA_white(X)
    (data_losses, reg_losses, accuracy) = overfit_tinydata(X, labels, layer_param, lr=10 ** (-0.5), lr_decay=1, mu=0.9,reg=10 ** (-10), max_epoch=100)
#    #%%
    layer_param = [dim, 100, 100, N_class]
    (X, labels) = gen_toy_data(dim, N_class, num_samp_per_class=200)
    (X_train, labels_train, X_val, labels_val, X_test, labels_test) = split_data(X, labels)
    (X_train_pca, X_val_pca, X_test_pca) = data_preprocess(X_train, X_val, X_test)
    (weights, biases) = hyperparam_random_search(X_train_pca, labels_train, X_val_pca, labels_val, layer_param,num_try=2, lr=[-1, -2.1], lr_decay=1, mu=0.9, reg=[-2, -5],max_epoch=10000)

The program consists of three parts: data preprocessing functions, network model functions, and the code that uses the model.
These can be split into three .py files: sections 1 and 2 go into data_processes.py, section 9 into train.py, and everything else into nn.py, where train.py imports the data_processes and nn modules.
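
A minimal sketch of what train.py could look like under that split (the module names follow the text above; the exact layout is up to you):

# train.py -- sketch of how the three modules fit together
import data_processes as dp   # sections 1 and 2
import nn                     # sections 3 to 8

dim, N_class = 2, 4
(X, labels) = dp.gen_toy_data(dim, N_class, num_samp_per_class=200)
(X_train, labels_train, X_val, labels_val, X_test, labels_test) = dp.split_data(X, labels)
(X_train_pca, X_val_pca, X_test_pca) = dp.data_preprocess(X_train, X_val, X_test)
layer_param = [dim, 100, 100, N_class]
(weights, biases) = nn.hyperparam_random_search(X_train_pca, labels_train, X_val_pca, labels_val,
                                                layer_param, num_try=2)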

With this, a neural network without BN (batch normalization) layers is fully implemented.

10. Disclaimer

To be clear, this blog is a set of study notes on 《卷积神经网络的Python实现》 (by 单建华, 人民邮电出版社). The code is not my own: I only fixed a few bugs in the original source, added some comments to make it easier to follow, and am sharing my own understanding and takeaways. The original resources can be downloaded from https://www.ituring.com.cn/book/2661. Seeing some kind words under my blog yesterday, I felt flattered but also somewhat undeserving.

Also, 单建华's 《卷积神经网络的Python实现》 really is an excellent book for newcomers who want to understand neural networks and convolutional neural networks; I recommend borrowing it from a library or buying a copy.
