机器学习——线性回归实验分析-CSDN博客

本文链接：https://blog.csdn.net/weixin_57205312/article/details/140893926

线性回归

**算法推导过程中已经给出了求解方法，基于最小二乘法直接求解，但这幷不是机器学习的思想，由此引入了梯度下降方法。本次实验课重点讲解其中每一步流程与实验对比分析。**

首先，导入工具包

import numpy as np
import os
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
# 字体大小设置
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
# 过滤警告
import warnings
warnings.filterwarnings('ignore')

然后定义X和y

import numpy as np
X = 2*np.random.rand(100,1)
y = 4+3*X + np.random.randn(100,1)

然后可以将这些点的分布画出来：

# b.  的解释： b表示颜色，点(.)表示线条格式
plt.plot(X,y,'b.')
plt.xlabel('X_1')
plt.ylabel('y')
# 设置X,y的取值范围
plt.axis([0,2,0,15])
plt.show()

结果：

方法一：采用参数直接求解的方法

# 参数直接求解方法
# 现在数据列左侧拼上一列1，numpy中c_表示按列添加，a_表示按行添加
X_b = np.c_[np.ones((100,1)),X]
# 矩阵求逆方法：numpy中工具包:np.linalg.inv()
theta_best = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y)

可以先将theta_best打印

theta_best

结果：

array([[4.12216181],
       [2.90957555]])

然后进行预测：

# 任取两个数据点，np.array([[0],[2]])表示起点是0，终点是2
X_new = np.array([[0],[2]])
# 然后便进行预测
# 测试的步骤跟训练的步骤必须一模一样，所以先加一列1,便得到测试数据
X_new_b = np.c_[np.ones((2,1)),X_new]
# 用当前的测试数据乘以权重参数，便能得到最终的结果，即X乘以theta=y
y_predict = X_new_b.dot(theta_best)
y_predict

结果：

array([[4.12216181],
       [9.94131292]])

然后就可以进行回归方程的绘制：

# 现在就可以进行回归方程的绘制
plt.plot(X_new,y_predict,'r--')
# 先绘制原始数据的散点
plt.plot(X,y,'b.')
plt.axis([0,2,0,15])
plt.show()

结果：

方法二：使用sklearn工具包

sklearn api文档：API Reference — scikit-learn 1.5.1 documentation

from sklearn.linear_model import LinearRegression
# 首先导入方法，并进行实例化
lin_reg = LinearRegression()
# 然后进行训练，使用fit方法
lin_reg.fit(X,y)
# 然后可以调用权重函数和偏执函数，并返回结果，如下：
print(lin_reg.coef_)
print(lin_reg.intercept_)
# 这两个函数的值和二分法中的theta_best的值是相等的，所以这两种方法是一致的，工具包更简便

结果：

[[2.90957555]]
[4.12216181]

梯度下降

批量梯度下降

首先定义学习率和迭代次数：

# 学习率eta
eta = 0.1
# 迭代次数n_iterations
n_iterations = 1000
# 选择样本的个数
m = 100
# 首先对权重参数进行随机的初始化
theta = np.random.randn(2,1)
# 然后进行迭代
for iteration in range(n_iterations):
    # 每次迭代都需要计算一下当前的梯度 , (1/m)或者(2/m)都一样，没有影响,此处用的依旧是上面的数据
    gradients = 2/m* X_b.T.dot((X_b.dot(theta)-y))
    # 然后进行梯度更新，theta-学习率（learningRate）*梯度（gradients）
    theta = theta - eta*gradients

观察theta:

theta    #跟之前直接算得到的结果一样

结果：

array([[4.12216181],
       [2.90957555]])

# 用测试集乘以theta就可以得到最终预测的结果值，此处跟最开始二分法得到的结果值一致
X_new_b.dot(theta)

结果：

array([[4.12216181],
       [9.94131292]])

不同学习率对结果的影响

theta_path_bgd = []
# 通过画图展示不同的学习率对结果的影响
def plot_gradient_descent(theta,eta,theta_path = None):
    # 首先计算样本的个数
    m = len(X_b)
    # 先将原始数据画出来
    plt.plot(X,y,'b.')
    # 然后指定迭代次数
    n_iterations = 1000
    # 然后进行遍历，
    for iteration in range(n_iterations):
        # 得到每一次迭代的预测值
        y_predict = X_new_b.dot(theta)
        # 画图
        plt.plot(X_new,y_predict,'b-')
        # 每次迭代都需要计算一下当前的梯度 , (1/m)或者(2/m)都一样，没有影响,此处用的依旧是上面的数据
        gradients = 2/m* X_b.T.dot((X_b.dot(theta)-y))
        # 然后进行梯度更新，theta-学习率（learningRate）*梯度（gradients）
        theta = theta - eta*gradients
        # 将每一步的theta保存起来
        if theta_path is not None:
            theta_path.append(theta)
    plt.xlabel('X_1')
    plt.axis([0,2,0,15])
    plt.title('eta = {}'.format(eta))

然后调用上述函数，观察对比实验结果：

# 调用上述函数，做对比实验
# 首先随机初始化tahta值
theta = np.random.randn(2,1)  # 一个偏置参数一个权重参数，所以是2列
# 指定图的大小,画大一点，因为里面有子图
plt.figure(figsize=(10,4))
# 画子图，画一行，画三个，首先画第一个：
plt.subplot(131)
plot_gradient_descent(theta,eta = 0.02)
# 画第二个子图
plt.subplot(132)
plot_gradient_descent(theta,eta = 0.1,theta_path=theta_path_bgd)
# 画第三个子图
plt.subplot(133)
plot_gradient_descent(theta,eta = 0.5)
plt.show()

结果：

随机梯度下降

# 随机梯度下降
theta_path_sgd = []
# 计算样本数量
m = len(X_b)
# 然后开始迭代
n_epochs = 50
# 指定衰减策略，学习率应逐渐衰减
t0 = 5
t1 = 50  #上述两个值为随机指定
# 学习率衰减函数
def learning_schedule(t):
    return t0/(t1+t)   #t0,t1不变，但是t在增大，所以可以实现学习率的衰减

# 进行theta的初始化
theta = np.random.randn(2,1)

for epoch in range(n_epochs):
    # 一个epoch 等于遍历了一遍数据集中的每一个样本，这样才算完成
    # 下面内层循环就开始对所有样本进行遍历
    for i in range(m):
        # 只展示前十个epoch值的前十次
        if epoch < 10 and i<10:
            y_predict = X_new_b.dot(theta)
            plt.plot(X_new,y_predict,'r-')
        # 选数据随机进行迭代
        random_index = np.random.randint(m)
        xi = X_b[random_index:random_index+1]
        yi = y[random_index:random_index+1]
        # 然后计算梯度
        gradients = 2* xi.T.dot((xi.dot(theta)-yi))
        # 计算完梯度，酒宴进行学习率的衰减
        # n_epochs*m+i表示当前迭代了多少次
        eta = learning_schedule(n_epochs*m+i) 
        # 然后更新theta参数
        theta = theta - eta*gradients
        # 将当前的theta值传入list中
        theta_path_sgd.append(theta)

# 然后就可以进行展示
# 首先还是原始数据
plt.plot(X,y,'b.')
plt.axis([0,2,0,15])
plt.show()
# 每次执行结果不太一样，因为当前是随机的梯度下降

结果：

小批量梯度下降：MiniBatch

theta_path_mgd = []
# 迭代次数
n_epochs= 50
# 指定minibatch的值
minibatch = 16
# theta随机初始化
theta = np.random.randn(2,1)

# 可以自己指定随机的种子，每次随机选择的结果都是一样的，所以最后的结果也都一样，相同的theta相同的minibatch....
# np.random.seed(0)

t = 0  #计数器，记录当前迭代多少次，因为下面每次都进行了加1的操作
for epoch in range(n_epochs):
    # 先对数据进行洗牌shuffled,打乱后的索引记为shuffled_indices,此处使用np.random.permutation方法，对m个值的索引进行打乱
    shuffled_indices = np.random.permutation(m)
    # 将索引回传到原始数据中
    X_b_shuffled = X_b[shuffled_indices]
    # y的索引顺序也发生改变
    y_shuffled = y[shuffled_indices]
    # range(0,m,minibatch)表示从第0个到第m个样本，每次大小就是minibatch
    for i in range(0,m,minibatch):
        t+=1
        xi = X_b_shuffled[i:i+minibatch]
        yi = y_shuffled[i:i+minibatch]
        # 梯度计算
        gradients = 2/minibatch* xi.T.dot((xi.dot(theta)-yi))
        # 计算完梯度，酒宴进行学习率的衰减
        # n_epochs*m+i表示当前迭代了多少次
        eta = learning_schedule(n_epochs*m+i) 
        # 然后更新theta参数
        theta = theta - eta*gradients
        # 将当前的theta值传入list中
        theta_path_mgd.append(theta)

可以观察一下theta值：

theta

结果：

array([[3.3035261 ],
       [1.50742677]])

3种策略的对比实验

首先拿到上述保存的三组实验的数据：

# 首先拿到三组数据
theta_path_bgd = np.array(theta_path_bgd)
theta_path_sgd = np.array(theta_path_sgd)
theta_path_mgd = np.array(theta_path_mgd)

然后便可以开始画图进行比较：

# 然后进行画图
# 首先指定大小，画在一个图里
plt.figure(figsize=(8,4))
plt.plot(theta_path_sgd[:,0],theta_path_sgd[:,1],'r-s',linewidth=1,label='SGD')
plt.plot(theta_path_mgd[:,0],theta_path_mgd[:,1],'g-+',linewidth=2,label='MINIGD')
plt.plot(theta_path_bgd[:,0],theta_path_bgd[:,1],'b-o',linewidth=3,label='BGD')
plt.legend(loc='upper left')
# plt.axis([2.5,4.5,1.0,4.0])
plt.show()

结果：

实际当中用minibatch比较多，一般情况下选择batch数量应当越大越好。

多项式回归

此节不再使用上述数据，采用复杂一点的数据

m = 100
# 6*np.random.rand(m,1)-3表示随机值是从-3到3的一个范围
X = 6*np.random.rand(m,1)-3
y = 0.5*X**2+X+np.random.randn(m,1)

plt.plot(X,y,'b.')
plt.xlabel('X_1')
plt.ylabel('y')
# 设置X,y的取值范围
plt.axis([-3,3,-5,10])
plt.show()

图展示：

# 首先调用工具包，PolynomialFeatures可以对数据进行多项式特征生成。它可以通过创建原始特征的交互项来扩展输入数据集，从而提高模型的拟合能力。
from sklearn.preprocessing import PolynomialFeatures
# 然后进行实例化
# degree = 2表示加入二次幂,include_bias=False表示不加偏置项
poly_features= PolynomialFeatures(degree = 2,include_bias=False)
# fit执行计算，transform整合结果并将结果回传
X_poly = poly_features.fit_transform(X)
X[0]

结果：array([-1.78671997])

X_poly[0]     #就表示  [X,X**2]

结果：

array([-1.78671997,  3.19236823])

然后导入线性回归，得到权重参数和偏置参数

# 导入线性回归
from sklearn.linear_model import LinearRegression
# 首先导入方法，并进行实例化
lin_reg = LinearRegression()
# 然后进行训练，使用fit方法
lin_reg.fit(X_poly,y)
# 然后可以调用权重参数和偏置参数，并返回结果，如下：
print(lin_reg.coef_)          #得到权重参数，例如下面得到的结果，就表示：y=1.03*x+0.488x**2+0.07
print(lin_reg.intercept_)     #得到偏置参数

结果：

[[1.03723665 0.48831041]]
[0.07033412]

选择测试数据，进行画图：

# 选择测试数据，进行画图，此时是曲线，两个参数不够，所以多做几个参数
# np.linspace(-3,3,100)表示从-3到3中等距选择100个数据
X_new = np.linspace(-3,3,100).reshape(100,1)
# 测试的步骤跟训练的步骤必须一模一样
X_new_poly = poly_features.transform(X_new)
# 然后调用线性回归，进行预测
y_new = lin_reg.predict(X_new_poly)
plt.plot(X,y,'b.')   #先画散点
plt.plot(X_new,y_new,'r--',label = 'prediction')
plt.axis([-3,3,-5,10])
plt.legend(loc='upper left')
plt.show()

结果：

对比试验，测试不同的degree值

调用工具包，进行流水线的数据处理

# 调用工具包，进行流水线的数据处理
from sklearn.pipeline import Pipeline
# 调用数据标准化工具包
from sklearn.preprocessing import StandardScaler
plt.figure(figsize=(12,6))
# 进行循环，测试不同数据的结果
for style, width, degree in(('g-',1,100),('b--',1,2),('r-+',1,1)):
    poly_features= PolynomialFeatures(degree = degree,include_bias=False)
    # 先将所有对象进行标准化操作
    std = StandardScaler()
    # 线性回归，先将对象实例化
    lin_reg = LinearRegression()
    # 进入流水线车间
    polynomial_reg = Pipeline([('poly_features',poly_features),       #（‘执行操作的名字’，执行操作的对象）
             ('StandardScaler',std),
             ('lin_reg',lin_reg)])
    polynomial_reg.fit(X,y)
    # 然后就可以进行预测
    y_new_2 = polynomial_reg.predict(X_new)
    plt.plot(X_new,y_new_2,style,label='degree '+str(degree),linewidth=width)
# 画原始数据
plt.plot(X,y,'b.')
plt.axis([-3,3,-5,10])
plt.legend(loc='upper center')
plt.show()
# degree越大越容易过拟合，2，3即可

结果：

特征变换的越复杂，得到的结果过拟合风险越高，不建议做的特别复杂。

数据样本数量对结果的影响

# 导入工具包,用于计算均方误差
from sklearn.metrics import mean_squared_error
# 将训练集和测试集分割
from sklearn.model_selection import train_test_split
# 定义函数，画出不同的学习曲线
def plot_learning_curves(model,X,y):
    # 第一步进行训练集和测试集的切分,random_state=0,指定种子，每次切分方式都一样
    X_train, X_val, y_train, y_val = train_test_split(X,y,test_size=0.2,random_state=0)
    # 做评估，先保存errors
    train_errors, val_errors = [],[]
    for m in range(1,len(X_train)):
        # 先进行训练
        model.fit(X_train[:m],y_train[:m])
        # 得到训练集的预测结果
        y_train_predict = model.predict(X_train[:m])
        # 得到验证集的预测结果
        y_val_predict = model.predict(X_val)
        train_errors.append(mean_squared_error(y_train[:m],y_train_predict[:m]))
        val_errors.append(mean_squared_error(y_val,y_val_predict))

# mse抖动可能太大，所以一般需要开根号
    plt.plot(np.sqrt(train_errors),'r-+',linewidth=2,label='train_error')
    plt.plot(np.sqrt(val_errors),'b-',linewidth=3,label='val_error')
    plt.xlabel('Training set size')
    plt.ylabel('RMSE')
    plt.legend()

# 依旧使用上面线性回归的model
lin_reg = LinearRegression()
plot_learning_curves(lin_reg,X,y)
plt.axis([0,80,0,3.5])
plt.show()

结果：

数据量越少，训练集的效果会越好，但是实际测试效果很一般。实做模型的时候需要参考测试集和验证集的效果。

正则化

对权重参数进行惩罚，让权重参数尽可能平滑一些，有两种不同的方法来进行正则化惩罚：岭回归和lasso

岭回归

# 先用岭回归做实验,导入岭回归工具包
from sklearn.linear_model import Ridge
# 重新构造一份数据集
np.random.seed(42)
m = 20
X = 3*np.random.rand(m,1)
y = 0.5*X+np.random.randn(m,1)/1.5 + 1
X_new = np.linspace(0,3,100).reshape(100,1)

def plot_model(model_class,polynomial,alphas,**model_kargs):
    # 然后进行对比实验
    # zip的作用就是将alpha与后面的颜色格式一一对应起来
    for alpha,style in zip(alphas,('b-','g--','r:')):
        # 初始化model
        model = model_class(alpha,**model_kargs)
        if polynomial:
            # 如果polynomial是True，则需要做流水线，PolynomialFeatures的作用是将原始特征转换为多项式特征
            model = Pipeline([('poly_features',PolynomialFeatures(degree=10,include_bias=False)),       #（‘执行操作的名字’，执行操作的对象）
             ('StandardScaler',StandardScaler()),
             ('lin_reg',model)])
        model.fit(X,y)
        # 得到y正则化后的结果y_new_regul
        y_new_regul = model.predict(X_new)
        lw = 2 if alpha>0 else 1
        plt.plot(X_new,y_new_regul,style,linewidth = lw, label = 'alpha = {}'.format(alpha))
    plt.plot(X,y,'b.',linewidth=3)
    plt.legend()

plt.figure(figsize=(14,6))
plt.subplot(121)
plot_model(Ridge,polynomial=False,alphas=(0,10,100))
plt.subplot(122)
plot_model(Ridge,polynomial=True,alphas=(0,10**-5,1))
plt.show()

惩罚力度越大，alpha值越大的时候，得到的决策方程越平稳。

Lasso

from sklearn.linear_model import Lasso

plt.figure(figsize=(14,6))
plt.subplot(121)
plot_model(Ridge,polynomial=False,alphas=(0,1,10))
plt.subplot(122)
plot_model(Ridge,polynomial=True,alphas=(0,10**-1,1))
plt.show()