Implementing Andrew Ng's Machine Learning Exercise 4 (Neural Networks) in Python

Neural Networks Learning

machine learning

Programming exercise 4 of Andrew Ng's machine learning course: recognizing handwritten digits with a back-propagation neural network. The dataset consists of 5000 handwritten-digit images at 20×20 resolution.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.io as io
import scipy.misc
import scipy.optimize as opt
import numpy.linalg as lina
data1 = io.loadmat('D:/python/practise/sample/machine-learning-ex4/data/ex4data1.mat')
X, y = data1['X'], data1['y']
data2 = io.loadmat('D:/python/practise/sample/machine-learning-ex4/data/ex4weights.mat')
Theta_1, Theta_2 = data2['Theta1'], data2['Theta2']
X = np.insert(X, 0, 1, axis = 1) # add a column of ones (bias term)

1 Neural Networks

1.1 Visualizing the data

def show_1_number(num):
    # drop the bias column, reshape to 20x20 and transpose (the data is stored column-major)
    testImgarr = X[num, 1:].reshape(20, 20).T
    # note: scipy.misc.toimage has been removed from newer SciPy releases;
    # plt.imshow(testImgarr, cmap='gray') works directly on the array instead
    testImgPIL = scipy.misc.toimage(testImgarr)
    plt.figure(figsize = (3, 3))
    plt.imshow(testImgPIL)
show_1_number(33)

(figure: a single 20×20 handwritten digit)

# Follows Cowry5's approach almost verbatim; I was too lazy to write my own
def plot_100_image(X): # plot 100 random digits
    sample_idx = np.random.choice(np.arange(X.shape[0]), 100)  # randomly pick 100 samples
    sample_images = X[sample_idx, :]  # (100,400)
    
    fig, ax_array = plt.subplots(nrows=10, ncols=10, sharey=True, sharex=True, figsize=(8, 8))

    for row in range(10):
        for column in range(10):
            ax_array[row, column].matshow(sample_images[10 * row + column].reshape((20, 20)).T, cmap='gray_r')
    plt.xticks([])
    plt.yticks([])        
    plt.show()
plot_100_image(X[:, 1: ])

(figure: 10×10 grid of 100 random training digits)

1.3 Feedforward and cost function

Start by defining the functions we need.

def sigmoid(z):
    sigmoid = 1 / (1 + np.exp(-z))
    return sigmoid

Evaluating the neural network cost function (unregularized):
$$J(\theta) = \frac{1}{m}\sum_{i=1}^{m}\sum_{k=1}^{K}\Big[-y_k^{(i)}\ln\big((h_\theta(x^{(i)}))_k\big) - \big(1-y_k^{(i)}\big)\ln\big(1-(h_\theta(x^{(i)}))_k\big)\Big]$$

def h_forward(X, theta_1, theta_2):
    A_1 = X
    Z_2 = A_1.dot(theta_1.T)  # 5000 * 25
    A_2 = sigmoid(Z_2)  # 5000 * 25
    A_2 = np.insert(A_2, 0, 1, axis=1) # 5000 * 26
    Z_3 = A_2.dot(theta_2.T) # 5000 * 10
    A_3 = sigmoid(Z_3) # 5000 * 10
    return A_3
y_test_1 = h_forward(X, Theta_1, Theta_2)
y_test_1[0]
array([1.12661530e-04, 1.74127856e-03, 2.52696959e-03, 1.84032321e-05,
       9.36263860e-03, 3.99270267e-03, 5.51517524e-03, 4.01468105e-04,
       6.48072305e-03, 9.95734012e-01])
y_test_1.shape
(5000, 10)
# Adjust y 
y_adj = y - 1 # after subtracting 1, the labels line up with the columns of h(x)
y_ser = pd.Series(y_adj.reshape(-1))
y_matrix = pd.get_dummies(y_ser).values # one-hot encode y
def J_func(X, y_matrix, theta_1, theta_2):
    h = h_forward(X, theta_1, theta_2)
    matrix = -(y_matrix*np.log(h) + (1-y_matrix)*(np.log(1-h)))
    j_value = matrix.sum()/len(X) # sum() without an axis argument adds over all elements
    return j_value
J_func(X, y_matrix, Theta_1, Theta_2)
0.2876291651613189

Evaluating the neural network cost function (regularized):
$$J(\theta) = \frac{1}{m}\sum_{i=1}^{m}\sum_{k=1}^{K}\Big[-y_k^{(i)}\ln\big((h_\theta(x^{(i)}))_k\big) - \big(1-y_k^{(i)}\big)\ln\big(1-(h_\theta(x^{(i)}))_k\big)\Big] + \frac{\lambda}{2m}\Big[\sum_{j=1}^{25}\sum_{k=1}^{400}\big(\Theta_{j,k}^{(1)}\big)^2 + \sum_{j=1}^{10}\sum_{k=1}^{25}\big(\Theta_{j,k}^{(2)}\big)^2\Big]$$
Every $\theta$ except the bias (intercept) terms is included in the regularization sum.

def J_func_reg(X, y_matrix, theta_1, theta_2, c=1):
    theta_01 = theta_1[:,1:] # the intercept (bias) parameters are not regularized
    theta_02 = theta_2[:,1:]
    reg = (np.square(theta_01).sum() + np.square(theta_02).sum()) * c / (2*len(X))
    return J_func(X, y_matrix, theta_1, theta_2) + reg
J_func_reg(X, y_matrix, Theta_1, Theta_2)
0.38376985909092365

2 Backpropagation

from IPython.display import Image
Image(filename = 'C:/Users/dennis/backprop.png')

(figure: backpropagation diagram, backprop.png)

2.1 Sigmoid gradient

$$g'(z) = \mathrm{sigmoid}'(z) = \left(\frac{1}{1+e^{-z}}\right)' = g(z)\cdot\big(1-g(z)\big) = \mathrm{sigmoid}(z)\cdot\big(1-\mathrm{sigmoid}(z)\big)$$

def sigmoid_gradient(z):
    val = sigmoid(z)*(1-sigmoid(z))
    return val
sigmoid_gradient(0) #test
0.25

2.2 Random initialization of Θ

INIT_EPSILON = 0.12
theta_1_init = np.random.rand(25, X.shape[1])*2*INIT_EPSILON - INIT_EPSILON
theta_2_init = np.random.rand(10,26)*2*INIT_EPSILON - INIT_EPSILON
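The fixed INIT_EPSILON = 0.12 above works fine here. The ex4 notes also suggest deriving the range from the layer sizes, roughly $\epsilon_{init} = \sqrt{6}/\sqrt{L_{in}+L_{out}}$; a small sketch of that alternative (rand_init_theta is a hypothetical helper, not part of the original code):

def rand_init_theta(l_in, l_out):
    # epsilon chosen from the fan-in/fan-out of the layer, as suggested in the ex4 notes
    eps = np.sqrt(6) / np.sqrt(l_in + l_out)
    # shape (l_out, l_in + 1): one column per input unit plus the bias column
    return np.random.rand(l_out, l_in + 1) * 2 * eps - eps
# e.g. theta_1_init = rand_init_theta(400, 25); theta_2_init = rand_init_theta(25, 10)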

2.3 Backpropagation

# forward propagation for a single example
def h_forward_multi(x, theta_1, theta_2):
    a_1 = x
    z_2 = a_1.dot(theta_1.T)  # 1 * 25
    a_2 = sigmoid(z_2)  # 1 * 25
    a_2 = np.insert(a_2, 0, 1) # 1 * 26 -- no axis argument, since a 1-D array has only one axis
    z_3 = a_2.dot(theta_2.T) # 1 * 10
    a_3 = sigmoid(z_3) # 1 * 10
    return a_2, a_3
# adjust my y
y_mine = y.copy() # use copy() so the original y is not modified
y_mine[y_mine == 10] = 0
y_mine_ser = pd.Series(y_mine.reshape(-1))
y_mine_matrix = pd.get_dummies(y_mine_ser).values
y_mine_matrix.shape
(5000, 10)
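The same one-hot matrix can also be built without pandas; a quick NumPy-only equivalent of the get_dummies call above, just as a sketch for comparison:

# compare each label against 0..9; column k is 1 where the label equals k
y_mine_matrix_np = (y_mine.reshape(-1, 1) == np.arange(10)).astype(float)
assert (y_mine_matrix_np == y_mine_matrix).all()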

The cost function for the w-th example is:
$$cost(w) = y^{(w)}\ln h_{\theta}(x^{(w)}) + \big(1-y^{(w)}\big)\ln\big(1-h_{\theta}(x^{(w)})\big)$$

The partial derivative for a single example is:
$$\frac{\partial\, cost(w)}{\partial \theta_{ij}^{(l)}} = \delta_i^{(l+1)} \cdot a_j^{(l)}$$

In the output layer: $\delta^{(L)} = a^{(L)} - y$
In the hidden layers: $\delta^{(l)} = \big(\theta^{(l)}\big)^{T}\,\delta^{(l+1)} \odot g'(z^{(l)})$, where $\odot$ denotes the element-wise product.

def backprop_forloop(X, y_mine_matrix, theta_1_init, theta_2_init):
    Delta_1 = np.zeros((25, 401))
    Delta_2 = np.zeros((10, 26))
    for w,t in enumerate(X):
        a_1 = t # 1 * 401
        a_2, a_3 = h_forward_multi(a_1, theta_1_init, theta_2_init)
        delta_3 = (a_3 - y_mine_matrix[w]).reshape((-1,1))  # 10 * 1 
        delta_2 = theta_2_init.T.dot(delta_3)*((a_2*(1-a_2)).reshape((-1,1))) #  26 * 1
        delta_2 = delta_2[1:] # drop δ0: the bias unit has no δ, 25 * 1

        # accumulate the gradient for all the examples
        Delta_1 = Delta_1 + delta_2.dot(a_1.reshape((1,401)))  # 25 * 401
        Delta_2 = Delta_2 + delta_3.dot(a_2.reshape((1,26)))  # 10 * 26

    D_1 = Delta_1 / len(X)
    D_2 = Delta_2 / len(X)
    return D_1, D_2
D_1, D_2 = backprop_forloop(X, y_mine_matrix, theta_1_init, theta_2_init)

2.4 Gradient checking

# flatten the two matrices and concatenate them
def unrolling_data(data_1, data_2):
    data_1 = data_1.ravel()
    data_2 = data_2.ravel()
    unrolling_data = np.concatenate([data_1, data_2])
    return unrolling_data
# reshape the flat vector back into the two matrices
def rolling_data(flat_data):
    data_1 = flat_data[ : 25*401].reshape((25,401))
    data_2 = flat_data[25*401 : ].reshape((10,26))
    return data_1, data_2
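A quick sanity check that the two helpers are inverses of each other:

# roundtrip test: unroll the provided weights and roll them back again
t1, t2 = rolling_data(unrolling_data(Theta_1, Theta_2))
assert (t1 == Theta_1).all() and (t2 == Theta_2).all()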

Also add cost-function versions that take the flattened θ vector as input.

# unregularized version
def J_func_unrolltheta(theta_unrolling, X, y_matrix):
    theta_1, theta_2 = rolling_data(theta_unrolling)
    return J_func(X, y_matrix, theta_1, theta_2)
# regularized version
def J_func_reg_unrolltheta(theta_unrolling, X, y_matrix, c = 1):
    theta_1, theta_2 = rolling_data(theta_unrolling)
    return J_func_reg(X, y_matrix, theta_1, theta_2, c)
def generate_numgrad(theta_unrolling):
    EPSILON = 1e-4
    gradApprox = np.zeros(len(theta_unrolling))
    
    for i in range(len(theta_unrolling)):
        thetaPlus = theta_unrolling.copy() # copy so the original values are not modified
        thetaPlus[i] = theta_unrolling[i] + EPSILON
        thetaMinus = theta_unrolling.copy() # copy so the original values are not modified
        thetaMinus[i] = theta_unrolling[i] - EPSILON
        gradApprox[i] = (J_func_unrolltheta(thetaPlus, X, y_mine_matrix) - J_func_unrolltheta(thetaMinus, X, y_mine_matrix)) / (2*EPSILON)
    
    return gradApprox
def gradient_checking(theta_1, theta_2, D_1, D_2):
    theta_unrolling = unrolling_data(theta_1, theta_2)
    D_unrolling = unrolling_data(D_1, D_2)
    gradApprox = generate_numgrad(theta_unrolling)
        
    Ng_diff = lina.norm(D_unrolling - gradApprox) / lina.norm(D_unrolling + gradApprox)
    return Ng_diff
Ng_diff = gradient_checking(theta_1_init, theta_2_init, D_1, D_2)
#difference = np.abs(D_unrolling - gradApprox)
#difference.max() # you should see a relative difference that is less than 1e-9   result = 5.939121416886906e-11

# Evaluate the norm of the difference between two solutions.  
# If you have a correct implementation, and assuming you used EPSILON = 0.0001 
# in computeNumericalGradient.m, then diff below should be less than 1e-9

# diff = norm(numgrad-grad)/norm(numgrad+grad);

#Ng_diff = lina.norm(D_unrolling - gradApprox) / lina.norm(D_unrolling + gradApprox)
Ng_diff # result computed with Andrew Ng's relative-difference formula
8.953099337199837e-11

2.5 Regularized neural network

$$\frac{\partial}{\partial\Theta_{ij}^{(l)}}J(\Theta) = D_{ij}^{(l)} = \frac{1}{m}\Delta_{ij}^{(l)} \qquad \text{for } j = 0$$

$$\frac{\partial}{\partial\Theta_{ij}^{(l)}}J(\Theta) = D_{ij}^{(l)} = \frac{1}{m}\Delta_{ij}^{(l)} + \frac{\lambda}{m}\Theta_{ij}^{(l)} \qquad \text{for } j \geq 1$$

# accumulate each example's gradient with a for loop
def backprop_gradient_forloop_reg(theta_unrolling, X, y_mine_matrix, c=1):
    
    theta_1, theta_2 = rolling_data(theta_unrolling)
    
    Delta_1 = np.zeros((25, 401))
    Delta_2 = np.zeros((10, 26))
    for w,t in enumerate(X):
        a_1 = t # 1 * 401
        a_2, a_3 = h_forward_multi(a_1, theta_1, theta_2)
        delta_3 = (a_3 - y_mine_matrix[w]).reshape((-1,1))  # 10 * 1 
        delta_2 = theta_2.T.dot(delta_3)*((a_2*(1-a_2)).reshape((-1,1))) #  26 * 1
        delta_2 = delta_2[1:] # drop δ0: the bias unit has no δ, 25 * 1

        # accumulate the gradient for all the examples
        Delta_1 = Delta_1 + delta_2.dot(a_1.reshape((1,401)))  # 25 * 401
        Delta_2 = Delta_2 + delta_3.dot(a_2.reshape((1,26)))  # 10 * 26
    
    # zero out the first column of θ: the bias terms are not regularized
    reg_1 = theta_1.copy()
    reg_1[:, 0] = 0
    reg_2 = theta_2.copy()
    reg_2[:, 0] = 0
    
    m = len(X)
    D_1_reg = Delta_1 / m + (c/m) * reg_1
    D_2_reg = Delta_2 / m + (c/m) * reg_2
    return unrolling_data(D_1_reg, D_2_reg)

Replacing the for loop with matrix multiplication makes the computation noticeably faster (a rough timing comparison follows the code below).

# vectorized (matrix) forward propagation
def h_forward_multi_matrix(x, theta_1, theta_2):
    a_1 = x
    z_2 = a_1.dot(theta_1.T)  # 5000 * 25
    a_2 = sigmoid(z_2)  # 5000 * 25
    a_2 = np.insert(a_2, 0, 1, axis = 1) # 5000 * 26
    z_3 = a_2.dot(theta_2.T) # 5000 * 10
    a_3 = sigmoid(z_3) # 5000 * 10
    return a_2, a_3
# compute the summed gradients with matrix operations
def backprop_gradient_matrix_reg(theta_unrolling, X, y_mine_matrix, c=1):
    
    theta_1, theta_2 = rolling_data(theta_unrolling)
    
    a_1 = X # 5000 * 401
    a_2, a_3 = h_forward_multi_matrix(a_1, theta_1, theta_2) # a_2: 5000 * 26 , a_3: 5000 * 10
    delta_3 = (a_3 - y_mine_matrix)  # 5000 * 10 
    delta_2 = (delta_3.dot(theta_2))*((a_2*(1-a_2))) #  5000 * 26
    delta_2 = delta_2[:,1:] # drop δ0: the bias unit has no δ, 5000 * 25

    # sum the gradient for all the examples via matrix's multiplication
    Delta_1 = delta_2.T.dot(a_1)  # 25 * 401
    Delta_2 = delta_3.T.dot(a_2)  # 10 * 26
    
    # zero out the first column of θ: the bias terms are not regularized
    reg_1 = theta_1.copy()
    reg_1[:, 0] = 0
    reg_2 = theta_2.copy()
    reg_2[:, 0] = 0
    
    m = len(X)
    D_1_reg = Delta_1 / m + (c/m) * reg_1
    D_2_reg = Delta_2 / m + (c/m) * reg_2
    return unrolling_data(D_1_reg, D_2_reg)
theta_init_unrolling = unrolling_data(theta_1_init, theta_2_init)
D_unrolling_reg = backprop_gradient_matrix_reg(theta_init_unrolling, X, y_mine_matrix, c=1)
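A rough timing comparison of the two gradient implementations (just a sketch; the exact numbers depend on the machine):

import timeit
# one full-gradient evaluation over all 5000 examples with each implementation
t_loop = timeit.timeit(lambda: backprop_gradient_forloop_reg(theta_init_unrolling, X, y_mine_matrix, c=1), number=1)
t_matrix = timeit.timeit(lambda: backprop_gradient_matrix_reg(theta_init_unrolling, X, y_mine_matrix, c=1), number=1)
print('for loop: %.2f s, vectorized: %.3f s' % (t_loop, t_matrix))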
# with regularization added, the numerical gradient must also be computed from the regularized cost
def generate_numgrad_reg(theta_unrolling, c=1):
    EPSILON = 1e-4
    gradApprox = np.zeros(len(theta_unrolling))
    
    for i in range(len(theta_unrolling)):
        thetaPlus = theta_unrolling.copy() # copy so the original values are not modified
        thetaPlus[i] = theta_unrolling[i] + EPSILON
        thetaMinus = theta_unrolling.copy() # copy so the original values are not modified
        thetaMinus[i] = theta_unrolling[i] - EPSILON
        gradApprox[i] = (J_func_reg_unrolltheta(thetaPlus, X, y_mine_matrix, c) - J_func_reg_unrolltheta(thetaMinus, X, y_mine_matrix, c)) / (2*EPSILON)
    
    return gradApprox
def gradient_checking_reg(theta_unrolling, D_unrolling_reg, c=1):
    
    gradApprox = generate_numgrad_reg(theta_unrolling, c)
        
    Ng_diff_reg = lina.norm(D_unrolling_reg - gradApprox) / lina.norm(D_unrolling_reg + gradApprox)
    
    return Ng_diff_reg
Ng_diff_reg = gradient_checking_reg(theta_init_unrolling, D_unrolling_reg, c=1)
Ng_diff_reg
9.170318454686764e-11

2.6 Learning parameters

def tnc_training(theta, x, y, c):
    res = opt.minimize(fun = J_func_reg_unrolltheta, x0 = theta, args = (x, y, c), method = 'TNC', jac = backprop_gradient_matrix_reg, options = {'maxiter' : 400})
    return res
res = tnc_training(theta_init_unrolling, X, y_mine_matrix, 1.28) # use lambda = 1.28
res
     fun: 0.36464137778688066
     jac: array([-2.16633907e-04,  4.44557739e-08,  1.08284677e-07, ...,
        3.33837037e-05, -1.29912542e-04,  2.92099707e-05])
 message: 'Max. number of function evaluations reached'
    nfev: 400
     nit: 28
  status: 3
 success: False
       x: array([ 5.07021100e-01,  1.73655367e-04,  4.22987020e-04, ...,
       -1.23295909e+00, -6.60098833e-01, -1.39914331e+00])
def h_forward_more(theta_unrolling, X):
    
    theta_1, theta_2 = rolling_data(theta_unrolling)
    
    a_1 = X
    z_2 = a_1.dot(theta_1.T)  # 5000 * 25
    a_2 = sigmoid(z_2)  # 5000 * 25
    a_2 = np.insert(a_2, 0, 1, axis = 1) # 5000 * 26
    z_3 = a_2.dot(theta_2.T) # 5000 * 10
    a_3 = sigmoid(z_3) # 5000 * 10
    return a_3
y_predict = h_forward_more(res.x, X).argmax(axis=1)
y_mine = y_mine.reshape(-1)
(y_predict == y_mine).mean()
0.9928
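As an extra check (not required by the exercise), the same predictions can be broken down into a per-digit training accuracy:

# per-digit training accuracy from the predictions above
for digit in range(10):
    mask = (y_mine == digit)
    print('digit %d: %.4f over %d samples' % (digit, (y_predict[mask] == digit).mean(), mask.sum()))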

3 Visualizing the hidden layer

X.shape
(5000, 401)
def visual_hidden(theta, x_sample):
    theta_1 = rolling_data(theta)[0]
    display_data = x_sample * theta_1 # (1*401) * (25*401) broadcasts to 25 * 401: each row of theta_1 is weighted element-wise by the sample
    display_data = display_data[:, 1:] # 25 * 400
    
    fig, axes = plt.subplots(5,5, sharex = True, sharey = True, figsize = (6,6))
    
    image_list = []
    for i in range(len(display_data)):
        image_i = scipy.misc.toimage(display_data[i].reshape((20,20)).T)
        image_list.append(image_i)
    a = 0
    for i in range(5):
        for j in range(5):
            axes[i,j].imshow(image_list[a])
            a += 1
    plt.subplots_adjust(wspace = 0, hspace = 0)
visual_hidden(res.x, X[600])

(figure: 5×5 grid visualizing the 25 hidden units)

Choose λ using the cross-validation approach from the next exercise (ex5).

First, shuffle the original dataset, then split it into a training set and a cross-validation set in a 7:3 ratio.

def randomly_split_dataset(X, y_matrix, ratio_train):
    data = np.c_[X, y_matrix]
    data_ = np.random.permutation(data)
    data_train = data_[ : int(len(X)*ratio_train)]
    data_cv = data_[int(len(X)*ratio_train) : ]
    random_X_train = data_train[: , : 401]
    random_y_train_matrix = data_train[: , 401: ]
    
    random_X_cv = data_cv[: , : 401]
    random_y_cv_matrix = data_cv[: , 401: ]
       
    return random_X_train, random_y_train_matrix, random_X_cv, random_y_cv_matrix

Train a model for each value of λ, then compute $J_{train}(\theta)$ and $J_{cv}(\theta)$:

$$J_{train}(\theta) = \frac{1}{2m}\left[\sum_{i=1}^{m}\big(h_\theta(x^{(i)}) - y^{(i)}\big)^2\right]$$

$$J_{cv}(\theta) = \frac{1}{2m_{cv}}\left[\sum_{i=1}^{m_{cv}}\big(h_\theta(x_{cv}^{(i)}) - y_{cv}^{(i)}\big)^2\right]$$

(Those are the squared-error definitions quoted from ex5; since this is a classification network, the code below evaluates the unregularized cross-entropy cost J_func on both sets instead, which serves the same purpose for choosing λ.)

def single_lambda_learn(theta_init_unrolling, X_train, y_train_matrix, X_cv, y_cv_matrix, c):
    res = tnc_training(theta_init_unrolling, X_train, y_train_matrix, c)
    
    J_train = J_func_unrolltheta(res.x, X_train, y_train_matrix)
    J_cv = J_func_unrolltheta(res.x, X_cv, y_cv_matrix)
    
    return J_train, J_cv
def more_lambda_learn(theta_init_unrolling, X_train, y_train_matrix, X_cv, y_cv_matrix, lambda_vec):
    zipped = []
    for c in lambda_vec:
        element = single_lambda_learn(theta_init_unrolling, X_train, y_train_matrix, X_cv, y_cv_matrix, c)
        zipped.append(element)
    
    J_train, J_cv = zip(* zipped)
    J_train = np.array(J_train)
    J_cv = np.array(J_cv)
    return J_train, J_cv
X_train, y_train_matrix, X_cv, y_cv_matrix = randomly_split_dataset(X, y_mine_matrix, ratio_train = 0.7)
def gen_2times_lambdaVec(start):
    vec = [0, start]
    for i in range(1,11):
        vec.append(vec[-1]*2)
    return vec    
times_lambdaVec = gen_2times_lambdaVec(0.01)
times_lambdaVec
[0, 0.01, 0.02, 0.04, 0.08, 0.16, 0.32, 0.64, 1.28, 2.56, 5.12, 10.24]
J_train, J_cv = more_lambda_learn(theta_init_unrolling, X_train, y_train_matrix, X_cv, y_cv_matrix, times_lambdaVec)
D:\Program Files (x86)\Anaconda3\lib\site-packages\ipykernel_launcher.py:3: RuntimeWarning: invalid value encountered in multiply
  This is separate from the ipykernel package so we can avoid doing imports until
D:\Program Files (x86)\Anaconda3\lib\site-packages\ipykernel_launcher.py:3: RuntimeWarning: divide by zero encountered in log
  This is separate from the ipykernel package so we can avoid doing imports until
def lambda_curve(J_train, J_cv, lambda_vec):
    plt.figure(figsize = (9,6))
    plt.plot(lambda_vec, J_train, color = 'b', label = 'J_train')
    plt.plot(lambda_vec, J_cv, color = 'g', label = 'J_cv')
    #plt.xticks(np.arange(11))
    plt.xlabel('lambda')
    plt.ylabel('error')
    plt.grid(True)
    plt.title('Lambda Curve', fontsize = 14)
    plt.legend()
lambda_curve(J_train, J_cv, times_lambdaVec)

(figure: J_train and J_cv versus lambda)

$J_{cv}(\theta)$ is smallest at λ = 1.28, so that value gives the best generalization.

Experiment: recognizing my own handwritten digit images

from PIL import Image as im
img_5 = im.open('D:/python/practise/sample/machine-learning-ex4/handwrite/5.png')
img_6 = im.open('D:/python/practise/sample/machine-learning-ex4/handwrite/6.png')
img_9 = im.open('D:/python/practise/sample/machine-learning-ex4/handwrite/9.png')

fig, ax = plt.subplots(1, 3, figsize = (10, 3))
ax[0].imshow(img_5)
ax[1].imshow(img_6)
ax[2].imshow(img_9)
<matplotlib.image.AxesImage at 0x23b021fa320>

(figure: the three handwritten test images for 5, 6 and 9)

test = np.array([np.array(img_5).ravel(), np.array(img_6).ravel(), np.array(img_9).ravel()])
test = np.insert(test, 0, 1, axis = 1)
h_forward_more(res.x, test).argmax(axis = 1) # using the parameters trained in section 2.6
array([0, 9, 8], dtype=int64)

The generalization to my own images is really poor!
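One likely reason (an assumption, not verified here) is a preprocessing mismatch: the ex4 images are 20×20 grayscale, roughly normalized, and stored column-major (hence all the .T transposes above), while the PNGs are fed in raw. A minimal preprocessing sketch, assuming PIL images of arbitrary size (preprocess_own_image is a hypothetical helper, not part of the original code):

def preprocess_own_image(img):
    # convert to grayscale, resize to 20x20 and scale the pixels to [0, 1]
    arr = np.array(img.convert('L').resize((20, 20)), dtype=float) / 255.0
    # the ex4 digits are bright strokes on a dark background; invert if the image is dark-on-white
    arr = 1.0 - arr
    # unroll column-major like the training data (the reason for the .T above) and add the bias 1
    return np.insert(arr.T.ravel(), 0, 1)
test2 = np.array([preprocess_own_image(img) for img in (img_5, img_6, img_9)])
h_forward_more(res.x, test2).argmax(axis = 1)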
