import numpy as np
import matplotlib.pyplot as plt
from scipy.io import loadmat
import scipy.optimize as opt
from sklearn.metrics import classification_report
path = 'C:/Users/liang/Desktop/Coursera-ML-AndrewNg-Notes/ex4data1.mat'
def load_mat(path):
    """Load the .mat data file and return X and the flattened labels y."""
    data = loadmat(path)
    X = data['X']
    y = data['y'].flatten()
    return X, y
def plot_100_image(X):
    """Randomly pick 100 of the 5000 images and draw them on a 10x10 grid."""
    index = np.random.choice(range(5000), 100)
    images = X[index]
    fig, ax_array = plt.subplots(10, 10, sharey=True, sharex=True, figsize=(8, 8))
    for r in range(10):
        for c in range(10):
            ax_array[r, c].matshow(images[r * 10 + c].reshape(20, 20), cmap='gray_r')
    plt.xticks([])
    plt.yticks([])
    plt.show()
X, y = load_mat(path)
plot_100_image(X)
def expand_y(y):
    """One-hot encode the labels: digit k (1-10) becomes a 10-vector with a 1 at index k-1."""
    result = []
    for i in y:
        y_array = np.zeros(10)
        y_array[i - 1] = 1
        result.append(y_array)
    return np.array(result)
raw_X, raw_y = load_mat(path)
X = np.insert(raw_X, 0, 1, axis=1)
y = expand_y(raw_y)
X.shape, y.shape
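# Expected shapes after adding the bias column: X is (5000, 401), y is (5000, 10).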
def load_weight(path):
    data = loadmat(path)
    return data['Theta1'], data['Theta2']
path2 = 'C:/Users/liang/Desktop/Coursera-ML-AndrewNg-Notes/ex4weights.mat'
t1, t2 = load_weight(path2)
t1.shape, t2.shape
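# Expected shapes: Theta1 is (25, 401) and Theta2 is (10, 26) for the
# 400-input, 25-hidden, 10-output network (each layer with a bias unit).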
def serialize(a, b):
    """Unroll both weight matrices into a single flat vector for the optimizer."""
    return np.r_[a.flatten(), b.flatten()]
theta = serialize(t1, t2)
theta.shape
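# 25*401 + 10*26 = 10285 parameters in total, matching random_init(10285) below.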
def deserialize(seq):
    """Recover Theta1 (25, 401) and Theta2 (10, 26) from the flat vector."""
    return seq[:25 * 401].reshape(25, 401), seq[25 * 401:].reshape(10, 26)
def sigmoid(z):
    return 1 / (1 + np.exp(-z))  # note the minus sign: 1/(1+exp(z)) would compute 1 - sigmoid(z)
def feed_forward(theta, X):
    """Forward propagation; X is assumed to already carry its bias column."""
    t1, t2 = deserialize(theta)
    a1 = X                                     # (5000, 401)
    z2 = a1 @ t1.T                             # (5000, 25)
    a2 = np.insert(sigmoid(z2), 0, 1, axis=1)  # (5000, 26), bias column added
    z3 = a2 @ t2.T                             # (5000, 10)
    a3 = sigmoid(z3)
    return a1, z2, a2, z3, a3
a1, z2, a2, z3, h = feed_forward(theta, X)
def cost(theta, X, y):
    """Unregularized cross-entropy cost, summed over classes and averaged over examples."""
    a1, z2, a2, z3, h = feed_forward(theta, X)
    J = 0
    for i in range(len(X)):
        first = -y[i] * np.log(h[i])
        second = (1 - y[i]) * np.log(1 - h[i])
        J = J + np.sum(first - second)
    J = J / len(X)
    return J
cost(theta, X, y)
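# A vectorized equivalent of the loop above, added here as a sketch
# (cost_vectorized is not part of the original notebook); both compute
# the same cross-entropy over the one-hot labels.
def cost_vectorized(theta, X, y):
    _, _, _, _, h = feed_forward(theta, X)
    return np.sum(-y * np.log(h) - (1 - y) * np.log(1 - h)) / len(X)

# With the pre-trained weights, both versions should print roughly 0.2876.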
def regularized_cost(theta, X, y, l=1):
    """Regularization skips each layer's bias term, i.e. the first column of each weight matrix."""
    t1, t2 = deserialize(theta)
    reg = np.sum(t1[:, 1:] ** 2) + np.sum(t2[:, 1:] ** 2)
    return l / (2 * len(X)) * reg + cost(theta, X, y)
regularized_cost(theta, X, y, 1)
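# Per the exercise, this should come out to roughly 0.3838 with the pre-trained weights and lambda=1.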
def sigmoid_gradient(z):
    return sigmoid(z) * (1 - sigmoid(z))
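# Quick sanity check: the sigmoid derivative peaks at z = 0, where it equals 0.25.
sigmoid_gradient(0)  # 0.25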
def random_init(size):
    """Return `size` values drawn uniformly from [-0.12, 0.12] to break symmetry."""
    return np.random.uniform(-0.12, 0.12, size)
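# The 0.12 bound follows the exercise's heuristic epsilon_init ~ sqrt(6) / sqrt(L_in + L_out).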
print('a1', a1.shape, 't1', t1.shape)
print('z2', z2.shape)
print('a2', a2.shape, 't2', t2.shape)
print('z3', z3.shape)
print('a3', h.shape)
def gradient(theta, X, y):
    '''
    Unregularized gradient. Notice there is no d1, since the input layer has no error term.
    Returns the gradients of all parameters, so each D(i) has the same shape as theta(i).
    '''
    t1, t2 = deserialize(theta)
    a1, z2, a2, z3, h = feed_forward(theta, X)
    d3 = h - y                                  # (5000, 10)
    d2 = d3 @ t2[:, 1:] * sigmoid_gradient(z2)  # (5000, 25)
    D2 = d3.T @ a2                              # (10, 26)
    D1 = d2.T @ a1                              # (25, 401)
    D = (1 / len(X)) * serialize(D1, D2)
    return D
def gradient_checking(theta, X, y, epsilon):
    """Compare the backprop gradient against a two-sided numerical approximation (very slow)."""
    numeric_grad = np.zeros(len(theta))
    for i in range(len(theta)):
        plus, minus = theta.copy(), theta.copy()
        plus[i] += epsilon
        minus[i] -= epsilon
        numeric_grad[i] = (cost(plus, X, y) - cost(minus, X, y)) / (2 * epsilon)
    analytic_grad = gradient(theta, X, y)
    diff = np.linalg.norm(numeric_grad - analytic_grad) / np.linalg.norm(numeric_grad + analytic_grad)
    print('If your backpropagation implementation is correct,\nthe relative difference will be smaller than 10e-9 (assume epsilon=0.0001).\nRelative Difference: {}\n'.format(diff))

# Warning: this evaluates the cost twice per parameter (2 * 10285 forward passes), so it is slow.
gradient_checking(theta, X, y, 0.0001)
def regularized_gradient(theta, X, y, l=1):
    """Do not penalize the bias parameters (the first column of each weight matrix)."""
    D1, D2 = deserialize(gradient(theta, X, y))
    t1, t2 = deserialize(theta)
    t1 = t1.copy()  # copy so the bias weights in theta are not zeroed through a view
    t2 = t2.copy()
    t1[:, 0] = 0
    t2[:, 0] = 0
    reg_D1 = D1 + (l / len(X)) * t1
    reg_D2 = D2 + (l / len(X)) * t2
    return serialize(reg_D1, reg_D2)
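# Zeroing the first columns means the bias gradients stay exactly the unregularized ones.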
def nn_training(X, y):
    init_theta = random_init(10285)  # 25*401 + 10*26
    res = opt.minimize(fun=regularized_cost,
                       x0=init_theta,
                       args=(X, y, 1),
                       method='TNC',
                       jac=regularized_gradient,
                       options={'maxiter': 400})
    return res
res = nn_training(X, y)
res
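# res.x holds the trained parameters and res.fun the final regularized cost;
# results differ from run to run because the initialization is random.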
def accuracy(theta, X, y):
    _, _, _, _, h = feed_forward(theta, X)  # use the theta argument, not the global res.x
    y_pred = np.argmax(h, axis=1) + 1       # map column index 0-9 back to labels 1-10
    print(classification_report(y, y_pred))
accuracy(res.x, X, raw_y)
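# With lambda=1 and 400 iterations, training-set accuracy typically lands around 99%,
# though the exact figure varies with the random initialization.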
def plot_hidden(theta):
    """Visualize the 25 hidden units: each row of Theta1 (minus the bias) is a 20x20 image."""
    t1, _ = deserialize(theta)
    t1 = t1[:, 1:]
    fig, ax_array = plt.subplots(5, 5, sharex=True, sharey=True, figsize=(6, 6))
    for r in range(5):
        for c in range(5):
            ax_array[r, c].matshow(t1[r * 5 + c].reshape(20, 20), cmap='gray_r')
    plt.xticks([])
    plt.yticks([])
    plt.show()
plot_hidden(res.x)
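# Each tile is the stroke-like input pattern that one hidden unit responds to most strongly.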