[Python嗯~机器学习]---用python3来做机器学习模型诊断

最新推荐文章于 2023-05-31 17:02:48 发布

鹏鹏哥哥的小红帽

最新推荐文章于 2023-05-31 17:02:48 发布

阅读量604

点赞数

分类专栏：鹏鹏哥哥的机器学习文章标签：机器学习 python 诊断

本文链接：https://blog.csdn.net/kepengs/article/details/84995985

版权

鹏鹏哥哥的机器学习专栏收录该内容

52 篇文章 11 订阅

订阅专栏

诊断机器学习

我们用代码来回顾一下上一个博客中关于参数选择的问题

In [1]:

import numpy as np
import matplotlib.pyplot as plt                          # 画图
from scipy.io import loadmat                             # 用scipy中的模块加载matlab的格式数据
from sklearn.preprocessing import PolynomialFeatures     # 用sklearn中的模块来特征构造

In [2]:

from pylab import mpl                                    # 画图                           

mpl.rcParams['font.sans-serif'] = ['SimHei']             # 指定默认字体
mpl.rcParams['axes.unicode_minus'] = False               # 解决保存图像是负号'-'显示为方块的问题

假设函数

In [3]:

def h(theta, x):
    """预测函数
    特征和参数都是一个列向量

    Args:
        theta 模型参数
        x 特征向量

    Returns:
        预测结果
    """
    return (theta.T * x)[0, 0]                           # theta.T * x 算出来的结果是一个1x1 的矩阵，(theta.T * x)[0, 0]取矩阵中的数值

代价函数

In [4]:

def J(theta, X, y, theLambda=0):
    """代价函数

    Args:
        theta 模型参数
        X 样本特征
        y 样本标签

    Returns:
        预测误差（代价）
    """
    m = len(X)
    
    # X * theta - y 是一个 m x 1 维的矩阵
    return (X * theta - y).T * (X * theta - y) / (2 * m) + theLambda * np.sum(np.square(theta)) / (2*m)

梯度下降

In [5]:

def gradient(X, y, alpha=1, maxLoop=50, epsilon=1e-5, theLambda=0, initTheta=None):
    """批量梯度下降法

    Args:
        X 样本特征
        y 样本标签
        alpha 学习率
        maxLoop 最大迭代次数
        epsilon 收敛精度
        theLambda 正则化参数
    Returns:
        theta, errors
    """
    m, n = X.shape
    # 初始化theta
    if initTheta is None:
        theta = np.zeros((n, 1))
    else:
        theta = initTheta
    count = 0
    
    error = float('inf')
    errors = [error,]                                                    # 转换为列表数据类型
    for i in range(maxLoop):
        theta = theta + (1.0 / m) * alpha * ((y - X * theta).T * X).T
        error = J(theta, X, y, theLambda)
        if np.isnan(error):
            error = np.inf
        
        errors.append(error)
        # 如果已经收敛
        if(abs(errors[-1]-errors[-2]) < epsilon):
            break
    print('iterating',i)
    return theta, errors

多个特征，不同数量级处理数据到一个数量级，加快迭代

In [6]:

def normalize(X):
    """特征归一化处理

    Args:
        X 样本集
    Returns:
        归一化后的样本集
    """
    m, n = X.shape
    # 归一化每一个特征
    for j in range(n):
        features = X[:,j]
        minVal = features.min(axis=0)
        maxVal = features.max(axis=0)
        diff = maxVal - minVal
        if diff != 0:
           X[:,j] = (features-minVal)/diff
        else:
           X[:,j] = 0
    return X

加载数据

In [7]:

data = loadmat('data/water.mat')
#####
# 数据集划分
#####
# 训练集
X = np.mat(data['X'])
# 添加偏置
X = np.concatenate((np.ones((X.shape[0], 1)), X), axis=1)
y = np.mat(data['y'])
# 交叉验证集
Xval = np.mat(data['Xval'])
Xval = np.concatenate((np.ones((Xval.shape[0], 1)), Xval), axis=1)
yval = np.mat(data['yval'])
# 测试集
Xtest = np.mat(data['Xtest'])
Xtest = np.concatenate((np.ones((Xtest.shape[0], 1)), Xtest), axis=1)
ytest = np.mat(data['ytest'])

In [8]:

X.shape[1]

Out[8]:

线性回归诊断

In [9]:

def diagnoseLR():
    """线性回归诊断
    """
    initTheta = np.mat(np.ones((X.shape[1], 1)))
    theta, errors = gradient(
        X, y, alpha=0.001, maxLoop=5000, epsilon=0.00001, initTheta=initTheta)

    # 绘制拟合成果
    Xmin = X[:, 1].min()
    Xmax = X[:, 1].max()
    ymax = y[:, 0].max()
    ymin = y[:, 0].min()
    fitX = np.mat(np.linspace(Xmin, Xmax, 20).reshape(-1, 1))           # reshape(-1, 1)不关心行数，但是1列
    fitX = np.concatenate((np.ones((fitX.shape[0], 1)), fitX), axis=1)

    h = fitX * theta
    plt.xlim(Xmin, Xmax)
    plt.ylim(ymin, ymax)
    # 绘制训练样本
    plt.scatter(X[:, 1].flatten().A[0], y[:, 0].flatten().A[0],marker='x',color='r', linewidth=2)
    # 绘制拟合曲线
    plt.plot(fitX[:, 1], h, color='b')
    plt.xlabel(u'水位变化(x)')
    plt.ylabel(u'大坝流量(y)')
    plt.show()

    # 绘制随样本规模学习曲线
    m, n = X.shape
    trainErrors = np.zeros((1,m))
    valErrors = np.zeros((1,m))
    for i in range(m):
        Xtrain = X[0:i+1]
        ytrain = y[0:i+1]
        # 注意，这里我们没有设置theLambda，实际上没有必要
        theta, errors = gradient(
            Xtrain, ytrain, alpha=0.001, maxLoop=10000, epsilon=0.00001)
        
        trainErrors[0,i] = J(theta, Xtrain, ytrain)
        valErrors[0,i] = J(theta, Xval, yval)

    print(u'最小交叉验证误差', valErrors.ravel()[-1])
    plt.plot(np.arange(1,m+1).ravel(), trainErrors.ravel(), color='b', label=u'测试误差')
    plt.plot(np.arange(1,m+1).ravel(), valErrors.ravel(), color='g', label=u'交叉验证误差')
    plt.title(u'线性回归学习曲线')
    plt.xlabel(u'训练样本量')
    plt.ylabel(u'误差')
    plt.legend()
    plt.show()

In [10]:

diagnoseLR()

iterating 4911

iterating 20
iterating 9999
iterating 4994
iterating 5207
iterating 5198
iterating 5124
iterating 4973
iterating 5137
iterating 5013
iterating 5294
iterating 5132
iterating 4994
最小交叉验证误差 29.564752654802664

非线性回归诊断

In [11]:

def diagnosePR():
    """多项式回归诊断
    """
    # 多项式回归
    poly = PolynomialFeatures(degree=10)
    XX, XXval, XXtest = [normalize(
        np.mat(poly.fit_transform(data[:, 1:]))) for data in [X, Xval, Xtest]]
    initTheta = np.mat(np.ones((XX.shape[1], 1)))
    theLambdas = [0, 0.01, 0.02, 0.03, 0.04, 0.05,0.06,0.07, 0.08, 0.09,0.10, 0.11, 0.12,0.13,0.14,0.15,0.2,0.3,0.5]
    numTheLambdas = len(theLambdas)
    trainErrors = np.zeros((1, numTheLambdas))
    valErrors = np.zeros((1, numTheLambdas))
    thetas = []
    for idx, theLambda in enumerate(theLambdas):
        theta, errors = gradient(
            XX, y, alpha=0.1, maxLoop=10000, epsilon=0.0001,
            theLambda=theLambda, initTheta=initTheta)
        
        thetas.append(theta)
        # 训练误差、交叉验证误差，不需要考虑模型复杂度，所以theLambda不需要设置或者设置成0
        trainErrors[0, idx] = J(theta,XX,y)
        valErrors[0, idx] = J(theta, XXval, yval)
        print(theLambda, valErrors[0,idx])

    bestLambda = theLambdas[np.argmin(valErrors)]
    theta = thetas[np.argmin(valErrors)]
    error = np.min(valErrors)

    # # 选择lambda
    plt.plot(theLambdas, trainErrors.ravel(), color='b',label=u'训练误差')
    plt.plot(theLambdas, valErrors.ravel(), color='g',label=u'交叉验证误差')
    plt.title(u'多项式回归选择λ')
    plt.xlabel(u'λ')
    plt.ylabel(u'误差')
    plt.legend()
    plt.show()

    # 绘制拟合曲线
    fitX = np.mat(np.linspace(-60, 45).reshape(-1, 1))
    fitX = np.concatenate((np.ones((fitX.shape[0], 1)), fitX), axis=1)
    fitXX = normalize(np.mat(poly.fit_transform(fitX[:, 1:])))
    h = fitXX * theta
    print(theta)
    plt.title(u'多项式回归拟合曲线(lambda=%.3f) \n  交叉验证误差=%.3f' % (bestLambda, error))
    
    plt.scatter(X[:, 1].tolist(), y[:, 0].tolist(), marker='x', color='r', linewidth=3)
    plt.plot(fitX[:, 1], h, color='b')
    plt.show()

In [12]:

diagnosePR()

iterating 1629
0 15.243389466469209
iterating 1376
0.01 15.01259952366252
iterating 1172
0.02 14.761963664239223
iterating 1014
0.03 14.52946148017376
iterating 889
0.04 14.333843279027338
iterating 789
0.05 14.187289975764603
iterating 707
0.06 14.093184875253355
iterating 641
0.07 14.053284486207566
iterating 586
0.08 14.061918063936215
iterating 541
0.09 14.112333334428358
iterating 503
0.1 14.198147212683487
iterating 471
0.11 14.311753596608908
iterating 444
0.12 14.445531704002528
iterating 420
0.13 14.60080965126319
iterating 400
0.14 14.761898131368469
iterating 382
0.15 14.936084971453127
iterating 314
0.2 15.934299059503644
iterating 9999
0.3 16.471240825244045
iterating 164
0.5 22.224487377564365

[[  1.        ]
 [ 33.95102063]
 [ 18.6465778 ]
 [ 12.39603696]
 [  6.03626333]
 [ -0.35284465]
 [ -2.41587214]
 [ -8.76756631]
 [ -7.44441674]
 [-14.36476901]
 [-10.39025965]]

鹏鹏哥哥的小红帽

关注

0
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
[Python嗯~机器学习]---用python3来做机器学习模型诊断

诊断机器学习我们用代码来回顾一下上一个博客中关于参数选择的问题In [1]:import numpy as npimport matplotlib.pyplot as plt # 画图from scipy.io import loadmat # 用scipy中的模块加载matl...
复制链接

扫一扫