Andrew Ng's Machine Learning Course: Programming Exercises | (5) ex5 - Bias vs. Variance

Bias and Variance

"""
了解算法性能中的偏差和方差概念
案例:利用水库水位变化预测大坝出水量
数据集:ex5data1.mat
机器学习课程的第五个编程练习(第六周讲解内容)
"""

import numpy as np
import scipy.io as sio
import scipy.optimize as opt
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


def reg_cost(theta, X, y, λ):
    cost = np.sum(np.power((X @ theta - y.flatten()), 2))
    reg = theta[1:] @ theta[1:] * λ

    return (cost + reg) / (2 * len(X))
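
# For reference: reg_cost computes the standard regularized linear-regression
# cost, with θ₀ excluded from the penalty (note the theta[1:] slice above):
#   J(θ) = (1/(2m)) · Σᵢ (θᵀx⁽ⁱ⁾ − y⁽ⁱ⁾)² + (λ/(2m)) · Σ_{j≥1} θⱼ²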


def reg_gradient(theta, X, y, λ):
    grad = (X @ theta - y.flatten()) @ X
    reg = λ * theta
    reg[0] = 0

    return (grad + reg) / (len(X))
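
# The matching gradient, with θ₀ left unregularized (reg[0] = 0 above):
#   ∂J/∂θⱼ = (1/m) Σᵢ (θᵀx⁽ⁱ⁾ − y⁽ⁱ⁾) xⱼ⁽ⁱ⁾ + (λ/m) θⱼ   for j ≥ 1
#   ∂J/∂θ₀ = (1/m) Σᵢ (θᵀx⁽ⁱ⁾ − y⁽ⁱ⁾)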


def linear_regression_model(X, y, λ):
    theta = np.ones(X.shape[1])

    res = opt.minimize(fun=reg_cost,
                       x0=theta,
                       args=(X, y, λ),
                       method='TNC',
                       jac=reg_gradient)

    return res.x
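
# Minimal usage sketch (hypothetical shapes; X must already carry a bias column):
#   X = np.insert(data['X'], 0, 1, axis=1)   # (m, n+1)
#   theta_hat = linear_regression_model(X, data['y'], λ=1)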


def plot_learning_curve(X_train, y_train, X_val, y_val, λ):  # plot training/CV error against the number of training examples
    x = range(1, len(X_train) + 1)
    training_cost = []
    cv_cost = []
    for i in x:
        res = linear_regression_model(X_train[:i, :], y_train[:i, :], λ)
        training_cost_i = reg_cost(res, X_train[:i, :], y_train[:i, :], λ)
        cv_cost_i = reg_cost(res, X_val, y_val, λ)
        training_cost.append(training_cost_i)
        cv_cost.append(cv_cost_i)
    fig, ax = plt.subplots()
    ax.plot(x, training_cost, label='training cost')
    ax.plot(x, cv_cost, label='cross-validation cost')
    ax.legend()
    plt.xlabel('number of training examples')
    plt.ylabel('error')
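
# How to read a learning curve: if training and cross-validation error converge
# to a similarly high value, the model has high bias (underfitting); a large,
# persistent gap between low training error and high CV error signals high
# variance (overfitting).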


# ***************** Construct polynomial features *****************
def poly_feature(X, power):
    for i in range(2, power + 1):
        X = np.insert(X, X.shape[1], np.power(X[:, 1], i), axis=1)
    return X
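
# Example (columns assumed to be [bias, x]): for X = [[1, 2]] and power = 3,
# poly_feature returns [[1, 2, 4, 8]] -- one appended column per power of the
# raw feature in column 1.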


def get_means_stds(X):
    means = np.mean(X, axis=0)
    stds = np.std(X, axis=0)
    return means, stds


def feature_normalize(X, means, stds):
    X[:, 1:] = (X[:, 1:] - means[1:]) / stds[1:]

    return X
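
# Note: normalization happens in place and the bias column (column 0) is left
# untouched. Validation and test sets must be scaled with the *training-set*
# means and stds so that all splits share the same feature space.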


def plot_data():  # scatter plot of the raw data
    fig, ax = plt.subplots()
    ax.scatter(X_train[:, 1], y_train)
    ax.set(xlabel='change in water level(x)',
           ylabel='water flowing out of the dam(y)')


def plot_poly_fit():
    plot_data()

    x_axis = np.linspace(-60, 60, 100)
    x = x_axis.reshape(100, 1)
    x = np.insert(x, 0, 1, axis=1)
    x = poly_feature(x, power)
    x = feature_normalize(x, train_means, train_stds)
    plt.plot(x_axis, x @ theta_fit, 'r--')


if __name__ == "__main__":
    data = sio.loadmat('ex5data1.mat')
    X_train, y_train = data['X'], data['y']  # (12,1)(12,1)
    X_test, y_test = data['Xtest'], data['ytest']
    print(X_train.shape, y_train.shape)
    df = pd.DataFrame({'water_level': X_train.ravel(), 'flow': y_train.ravel()})

    # ----------------- Scatter plot and linear fit line (Figure 1) -------------------------
    sns.set(context="notebook", style='darkgrid', palette='deep')
    sns.lmplot(x='water_level', y='flow', data=df, fit_reg=False, height=5)

    X_train = np.insert(X_train, 0, 1, axis=1)  # X_train-->(12,2)
    theta = np.ones(X_train.shape[1])

    print(reg_cost(theta, X_train, y_train, λ=0))
    print(reg_gradient(theta, X_train, y_train, λ=0))
    final_theta = linear_regression_model(X_train, y_train, λ=0)
    plt.plot(X_train[:, 1], X_train @ final_theta, c='r')  # fitted line

    # -------------------- Error vs. number of training examples (Figure 2) ----------------
    X_val, y_val = data['Xval'], data['yval']
    X_val = np.insert(X_val, 0, 1, axis=1)
    plot_learning_curve(X_train, y_train, X_val, y_val, λ=0)
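    # Both curves plateau at a high error, so adding more data will not help:
    # the straight-line model underfits (high bias).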

    # ------------- Construct polynomial features and fit a polynomial regression (Figure 3 - Figure 6) -------------------
    power = 6
    X_test = np.insert(X_test, 0, 1, axis=1)
    X_train_poly = poly_feature(X_train, power)
    X_val_poly = poly_feature(X_val, power)
    X_test_poly = poly_feature(X_test, power)

    train_means, train_stds = get_means_stds(X_train_poly)  # mean and standard deviation of each column (feature)

    X_train_norm = feature_normalize(X_train_poly, train_means, train_stds)
    X_val_norm = feature_normalize(X_val_poly, train_means, train_stds)
    X_test_norm = feature_normalize(X_test_poly, train_means, train_stds)

    theta_fit = linear_regression_model(X_train_norm, y_train, λ=0)
    plot_poly_fit()  # Figure 3
    plot_learning_curve(X_train_norm, y_train, X_val_norm, y_val, λ=0)  # Figure 4: overfitting

    theta_fit = linear_regression_model(X_train_norm, y_train, λ=100)
    plot_poly_fit()  # Figure 5
    plot_learning_curve(X_train_norm, y_train, X_val_norm, y_val, λ=100)  # Figure 6: the regularization coefficient is too large, so the model now underfits
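    # Contrast Figures 4 and 6: with λ=0 the training error is near zero while
    # the CV error stays high (high variance); with λ=100 both errors are high
    # (high bias). An intermediate λ should balance the two.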

    # ------------------------- Find the best λ (Figure 7) ------------------------------
    λ_values = [0, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10]
    training_cost, cv_cost = [], []
    for λ in λ_values:
        res_x = linear_regression_model(X_train_norm, y_train, λ)

        tc = reg_cost(res_x, X_train_norm, y_train, 0)
        cv = reg_cost(res_x, X_val_norm, y_val, 0)

        training_cost.append(tc)
        cv_cost.append(cv)

    fig_1, ax1 = plt.subplots()
    plt.plot(λ_values, training_cost, label='training cost')
    plt.plot(λ_values, cv_cost, label='cv cost')
    plt.legend(loc=2)

    plt.xlabel('λ')
    plt.ylabel('cost')
    λ_fit = λ_values[np.argmin(cv_cost)]
    print(λ_fit)  # after tuning, λ = 0.3 is the best choice here: it gives the lowest cross-validation cost
    # use the test set to compute the final cost
    for λ in λ_values:
        theta_ = linear_regression_model(X_train_norm, y_train, λ)
        print('test cost(λ={}) = {}'.format(λ, reg_cost(theta_, X_test_norm, y_test, 0)))

    plt.show()