吴恩达机器学习第四次作业（python实现）：BP神经网络

最新推荐文章于 2021-09-01 18:10:57 发布

Yue9812

最新推荐文章于 2021-09-01 18:10:57 发布

阅读量3.9k

点赞数 5

文章标签：机器学习 BP神经网络反向传播 python

本文链接：https://blog.csdn.net/zy1337602899/article/details/85309309

版权

BP神经网络

数据放在这：
在这里插入图片描述
先放上整体代码，后面对具体函数有解释

import numpy as np
from scipy.io import loadmat
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import scipy.optimize as opt
import time


def raw_data(path):
    data=loadmat(path)
    return data


def sigmoid(z):
    return 1/(1+np.exp(-z))


# 随机生成100张图片
def random_100_image(x):
    indexs=np.random.choice(x.shape[0],100)
    images=x[indexs]
    # 将幕布分成10x10份，每一份尺寸是10x10
    fig,axs=plt.subplots(10,10,sharex=True,sharey=True,figsize=(10,10))
    for row in range(10):
        for col in range(10):
            axs[row,col].matshow(images[row*10+col].reshape(20,20),cmap='gray_r')
    plt.xticks([])
    plt.yticks([])
    plt.show()


# 将y由列向量转换为矩阵
def matrix_y(y):
    mat_y=[]
    for i in y:
        v_y=np.zeros(10)
        v_y[i-1]=1
        mat_y.append(v_y)
    return np.array(mat_y)


# 将两个二维矩阵合并为一个一维矩阵，针对theta以及j的偏导数D
def de_dimension(m1,m2):
    return np.r_[m1.ravel(),m2.ravel()]


# 将一个一维矩阵拆成两个二维矩阵，针对theta和theta的偏导
def up_dimension(theta):
    theta1 = theta[:401 * 25].reshape(401, 25)
    theta2 = theta[401 * 25:].reshape(26, 10)
    return theta1,theta2


# 三层神经网络(400,25,10)，前向传播
def feed_forward(theta,x):
    theta1,theta2=up_dimension(theta)
    a1=x
    z2=a1.dot(theta1)
    a2=sigmoid(z2)
    a2=np.c_[np.ones(a2.shape[0]),a2]
    z3=a2.dot(theta2)
    a3=sigmoid(z3)
    return a1,z2,a2,z3,a3


# 代价函数，用for循环实现，此方法速度较慢
def cost_fun(theta,x,y):
    a1,z2,a2,z3,h=feed_forward(theta,x)
    j=0
    for i in range(len(x)):
        first=y[i].dot(np.log(h[i]))
        second=(1-y[i]).dot(np.log(1-h[i]))
        j=j+first+second
    j=j/-len(x)
    return j


# 代价函数，向量化实现，推荐
def cost_fun_vector(theta,x,y):
    a1, z2, a2, z3, h = feed_forward(theta, x)
    j=np.sum(y*(np.log(h))+(1-y)*(np.log(1-h)))/(-len(x))
    return j


# 正则化代价函数，向量化实现，推荐
def regularized_cost_vector(theta,x,y,lam=1):
    theta1, theta2 = up_dimension(theta)
    t=np.sum(np.power(theta1,2))+np.sum(np.power(theta2,2))  # 此处需要考虑theta第一行
    t1=t*lam/(2*len(x))
    return cost_fun_vector(theta,x,y)+t1


# 正则化代价函数，含有for循环速度较慢
def regularized_cost(theta,x,y,lam=1):
    theta1,theta2=up_dimension(theta)
    t=np.sum(np.power(theta1,2))+np.sum(np.power(theta2,2))
    t1=t*lam/(2*len(x))
    return cost_fun(theta,x,y)+t1


# sigmoid函数对z求偏导
def partial_sigmoid(z):
    return sigmoid(z)*(1-sigmoid(z))


# 反向传播：代价函数第一项对theta求偏导
def gradient(theta,x,y):
    a1, z2, a2, z3, h=feed_forward(theta,x)
    theta1,theta2=up_dimension(theta)
    d3=h-y  # (5000, 10)
    # 由于维度问题，需要删去theta2的一列
    d2= d3 @ (theta2.T[:, 1:]) * partial_sigmoid(z2)  # (5000, 25)
    D2=(a2.T)@d3  # (26, 10),theta2的偏导数
    D1=(a1.T)@d2  # (401, 25),theta1的偏导数
    D=de_dimension(D1,D2)/len(x)
    return D


# 正则化的代价函数对theta求偏导
def regularized_gradient(theta,x,y,lam=1):
    m=len(x)
    D = gradient(theta, x, y)
    theta1,theta2=up_dimension(theta)
    theta1[0]=0
    theta2[0]=0
    '''
    D1,D2=up_dimension(D)
    D1=D1+theta1*lam/m
    D2=D2+theta2*lam/m
    reg_D=de_dimension(D1,D2)
    '''
    theta=de_dimension(theta1,theta2)
    reg_D=D+theta*lam/m
    return reg_D


# 梯度检验
def gradient_check(theta,x,y,e=0.0001):
    p_theta=[]  # 存放每一个theta的偏导数
    for i in range(len(theta)):
        theta1,theta2=up_dimension(theta)
        list1=np.array(range(0,theta1.shape[1]))
        list2=np.array(range(theta1.size,theta1.size+theta2.shape[1]))
        add=theta.copy()
        sub=theta.copy()
        add[i]=add[i]+e
        sub[i]=sub[i]-e
        """
        用偏导数的定义求偏导，此处强烈建议用向量化的代价函数，如果用循环的代价函数，循环次数为5k万+次
        用循环的代价函数，执行时间30+min，向量化大概8min左右
        
        """
        if i in list1:
            p_t = (cost_fun_vector(add, x, y) - cost_fun_vector(sub, x, y)) / (2 * e)
        elif i in list2:
            p_t = (cost_fun_vector(add, x, y) - cost_fun_vector(sub, x, y)) / (2 * e)
        else:
            p_t=(regularized_cost_vector(add,x,y)-regularized_cost_vector(sub,x,y))/(2*e)
        p_theta.append(p_t)
    p_thetam=np.array(p_theta)
    reg_D=regularized_gradient(theta,x,y)
    # 通常用欧氏距离衡量
    difference=np.linalg.norm(reg_D-p_thetam)/(np.linalg.norm(reg_D)+np.linalg.norm(p_thetam))
    print("梯度检测误差(小于0.0001(e)合格):",difference)


# 预测结果
def predict(theta,x):
    a1, z2, a2, z3, h=feed_forward(theta,x)
    # 从h中的每一行选择最大的值，将其下标+1放入y中
    y=np.argmax(h,axis=1)+1
    return y


# 根据行列随机生成theta，范围是-0.3～0.3，一个生成row*col个元素
def random_theta(row,col):
    return np.array(np.random.uniform(-0.3,0.3,row*col)).reshape(row,col)


def main():
    start = time.process_time()
    rawdata=raw_data('venv/lib/dataset/ex3data1.mat')
    x1=rawdata['X']
    x=np.c_[np.ones(x1.shape[0]),x1]
    y1=rawdata['y'].ravel()
    y=matrix_y(y1)
    random_100_image(x1)
    # print(x.shape,y.shape)  # (5000, 401) (5000, 10)
    theta1=random_theta(401,25)
    theta2=random_theta(26,10)
    # print(theta1.shape,theta2.shape)  # (401, 25) (26, 10)
    theta=de_dimension(theta1,theta2)
    gradient_check(theta,x,y)  # 确定正确后立刻关闭，耗时，慎用
    result=opt.minimize(fun=regularized_cost_vector,x0=theta,args=(x,y),method='tnc',jac=regularized_gradient)
    theta=result.x
    print(classification_report(y1,predict(theta,x)))
    end = time.process_time()
    print("Program run time is",(end - start)/60, "minutes")
    '''
    theta=raw_data('venv/lib/dataset/ex3weights.mat')
    theta=de_dimension(theta['Theta1'].T,theta['Theta2'].T)
    print(regularized_cost(theta,x,y))
    '''


main()

随机生成100张图片
先从0～5000中随机挑选出100张图片（选择100个数字当作x矩阵的行下标），然后对选择的行进行reshape到（20，20），转化为灰度图像输出即可

# 随机生成100张图片
def random_100_image(x):
    indexs=np.random.choice(x.shape[0],100)
    images=x[indexs]
    # 将幕布分成10x10份，每一份尺寸是10x10
    fig,axs=plt.subplots(10,10,sharex=True,sharey=True,figsize=(10,10))
    for row in range(10):
        for col in range(10):
            axs[row,col].matshow(images[row*10+col].reshape(20,20),cmap='gray_r')
    plt.xticks([])
    plt.yticks([])
    plt.show()

在这里插入图片描述
然后我们要做的是随机生成theta，一共三层网络，所以要生成两个theta矩阵

# 根据行列随机生成theta，范围是-0.3～0.3，一个生成row*col个元素
def random_theta(row,col):
    return np.array(np.random.uniform(-0.3,0.3,row*col)).reshape(row,col)

theta1=random_theta(401,25)
theta2=random_theta(26,10)

然后我们要将这两个theta合并为一个theta向量，后面高级优化方法会用

# 将两个二维矩阵合并为一个一维矩阵，针对theta以及j的偏导数D
def de_dimension(m1,m2):
    return np.r_[m1.ravel(),m2.ravel()]

有合并就要有分离

# 将一个一维矩阵拆成两个二维矩阵，针对theta和theta的偏导
def up_dimension(theta):
    theta1 = theta[:401 * 25].reshape(401, 25)
    theta2 = theta[401 * 25:].reshape(26, 10)
    return theta1,theta2

此时我们将theta合并

theta=de_dimension(theta1,theta2)

还有一件事，我们要将y向量转换为矩阵，其实就是将y的值所对应的位置变为1，其他为0，比如y是3，转为为[00300···]

# 将y由列向量转换为矩阵
def matrix_y(y):
    mat_y=[]
    for i in y:
        v_y=np.zeros(10)
        v_y[i-1]=1
        mat_y.append(v_y)
    return np.array(mat_y)

准备就绪，接下来要做的事就是前向传播，不多说，上次作业已经写过

# 三层神经网络(400,25,10)，前向传播
def feed_forward(theta,x):
    theta1,theta2=up_dimension(theta)
    a1=x
    z2=a1.dot(theta1)
    a2=sigmoid(z2)
    a2=np.c_[np.ones(a2.shape[0]),a2]
    z3=a2.dot(theta2)
    a3=sigmoid(z3)
    return a1,z2,a2,z3,a3

然后，代价函数，这里是向量化实现，源码里有for循环实现

# 代价函数，向量化实现，推荐
def cost_fun_vector(theta,x,y):
    a1, z2, a2, z3, h = feed_forward(theta, x)
    j=np.sum(y*(np.log(h))+(1-y)*(np.log(1-h)))/(-len(x))
    return j

正则化代价函数

# 正则化代价函数，向量化实现，推荐
def regularized_cost_vector(theta,x,y,lam=1):
    theta1, theta2 = up_dimension(theta)
    t=np.sum(np.power(theta1,2))+np.sum(np.power(theta2,2))  # 此处需要考虑theta第一行
    t1=t*lam/(2*len(x))
    return cost_fun_vector(theta,x,y)+t1

关键来了，反向传播，此时要用到sigmoid函数的导数，很容易求

# sigmoid函数对z求偏导
def partial_sigmoid(z):
    return sigmoid(z)*(1-sigmoid(z))

接下来就是 $\frac{\partial J}{\partial \Theta }$ ，这是重点

# 反向传播：代价函数第一项对theta求偏导
def gradient(theta,x,y):
    a1, z2, a2, z3, h=feed_forward(theta,x)
    theta1,theta2=up_dimension(theta)
    d3=h-y  # (5000, 10)
    # 由于维度问题，需要删去theta2的一列
    d2= d3 @ (theta2.T[:, 1:]) * partial_sigmoid(z2)  # (5000, 25)
    D2=(a2.T)@d3  # (26, 10),theta2的偏导数
    D1=(a1.T)@d2  # (401, 25),theta1的偏导数
    D=de_dimension(D1,D2)/len(x)
    return D

然后正则化

# 正则化的代价函数对theta求偏导
def regularized_gradient(theta,x,y,lam=1):
    m=len(x)
    D = gradient(theta, x, y)
    theta1,theta2=up_dimension(theta)
    theta1[0]=0
    theta2[0]=0
    '''
    D1,D2=up_dimension(D)
    D1=D1+theta1*lam/m
    D2=D2+theta2*lam/m
    reg_D=de_dimension(D1,D2)
    '''
    theta=de_dimension(theta1,theta2)
    reg_D=D+theta*lam/m
    return reg_D

这个时候就完成反向传播了，不过不要着急，还有梯度检验，防止出错，梯度检验实际上是用导数的定义 $\lim_{\varepsilon \to 0}\frac{f(\theta +\varepsilon )-f(\theta-\varepsilon )}{2\varepsilon }\approx \frac{df}{d\theta }$ ，我们只需将 $\varepsilon$ 取很小即可，但也不要太小，记住检验完成后若果正确就关闭，太耗时。

# 梯度检验
def gradient_check(theta,x,y,e=0.0001):
    p_theta=[]  # 存放每一个theta的偏导数
    for i in range(len(theta)):
        theta1,theta2=up_dimension(theta)
        list1=np.array(range(0,theta1.shape[1]))
        list2=np.array(range(theta1.size,theta1.size+theta2.shape[1]))
        add=theta.copy()
        sub=theta.copy()
        add[i]=add[i]+e
        sub[i]=sub[i]-e
        """
        用偏导数的定义求偏导，此处强烈建议用向量化的代价函数，如果用循环的代价函数，循环次数为5k万+次
        用循环的代价函数，执行时间30+min，向量化大概8min左右
        """
        # 当theta是第一行(theta0)时，认为j对theta的偏导数为0，
        # 即不惩罚第一项，所以此时不需要对theta求偏导
        if i in list1:
            p_t = (cost_fun_vector(add, x, y) - cost_fun_vector(sub, x, y)) / (2 * e)
        elif i in list2:
            p_t = (cost_fun_vector(add, x, y) - cost_fun_vector(sub, x, y)) / (2 * e)
        else:
            p_t=(regularized_cost_vector(add,x,y)-regularized_cost_vector(sub,x,y))/(2*e)
        p_theta.append(p_t)
    p_thetam=np.array(p_theta)
    reg_D=regularized_gradient(theta,x,y)
    # 通常用欧氏距离衡量,此时误差小于epsilon即认为正确
    difference=np.linalg.norm(reg_D-p_thetam)/(np.linalg.norm(reg_D)+np.linalg.norm(p_thetam))
    print("梯度检测误差(小于0.0001(e)合格):",difference)

梯度检验结果：
在这里插入图片描述
运行了9分钟
关于结果误差如何确定，通常采用 $\frac{\left \| grad-gradapprox \right \|_{2}}{\left \| grad \right \|_{2}+\left \| gradapprox \right \|_{2}}$
其中 $\left \| \right \|_{2}$ 为二范数，分子为欧式距离，该方法确定的误差，若difference<epsilon(e)则反向传播算法正确，否则应检查反向传播算法。

接下来就可以反向传播了

result=opt.minimize(fun=regularized_cost_vector,x0=theta,args=(x,y),method='tnc',jac=regularized_gradient)
theta=result.x

预测

# 预测结果
def predict(theta,x):
    a1, z2, a2, z3, h=feed_forward(theta,x)
    # 从h中的每一行选择最大的值，将其下标+1放入y中
    y=np.argmax(h,axis=1)+1
    return y

精度，召回率，f1-score

print(classification_report(y1,predict(theta,x)))

最后结果95%多的样子
在这里插入图片描述
关于report结果，如果不会看的话，看这里classification_report结果说明
另外，由于每次theta初始化不同，执行时间和结果都会有差异，但差别不大。

Yue9812

关注

5
点赞
踩
32

收藏

觉得还不错? 一键收藏
4
评论
吴恩达机器学习第四次作业（python实现）：BP神经网络

bp神经网络数据放在这：import numpy as npfrom scipy.io import loadmatfrom sklearn.metrics import classification_reportimport matplotlib.pyplot as pltimport scipy.optimize as optimport timedef raw_data...
复制链接

扫一扫