sklearn实现多项式回归

需求文档

1)生成数据集

import numpy as np
import matplotlib.pyplot as plt

# Ground-truth cubic model: y = 1.2*x - 3.4*x^2 + 5.6*x^3 + 5 (+ noise).
n_train, n_test, true_w, true_b = 100, 100, [1.2, -3.4, 5.6], 5
# Alternative deterministic grid: X = np.linspace(-3, 3, n_train + n_test)
X = np.random.normal(size=(n_train + n_test))
X = X.reshape(-1, 1)  # column vector, shape (n_train + n_test, 1)
# Explicit polynomial features [x, x^2, x^3].
poly_features = np.concatenate((X, np.power(X, 2), np.power(X, 3)), axis=1)
# Fix: use true_b instead of the hard-coded constant 5, so the labels stay
# consistent if the ground-truth bias is ever changed.
y = (true_w[0] * poly_features[:, 0]
     + true_w[1] * poly_features[:, 1]
     + true_w[2] * poly_features[:, 2]
     + true_b)
y += np.random.normal(scale=0.1, size=y.size)  # Gaussian observation noise
plt.plot(X, y, 'b.')
plt.show()

在这里插入图片描述

2)三阶多项式函数拟合

from sklearn.linear_model import LinearRegression

# Linear regression on the explicit [x, x^2, x^3] features == degree-3
# polynomial regression.
lin_reg = LinearRegression()
lin_reg.fit(poly_features[:n_train], y[:n_train])
# Fix: the held-out split starts at n_train, not n_test — the original only
# worked because n_train and n_test happen to both be 100.
y_predict = lin_reg.predict(poly_features[n_train:])
# Sort by x so the fitted curve is drawn left-to-right.
order = np.argsort(poly_features[n_train:, 0])
plt.plot(poly_features[n_train:, 0][order], y_predict[order], 'r-')
plt.plot(X, y, 'b.')
plt.show()
lin_reg.coef_, lin_reg.intercept_

在这里插入图片描述

(array([ 1.18754391, -3.39746813,  5.60116652]), 4.981659384328852)

①最佳参数:[4.981659384328852, 1.18754391, -3.39746813, 5.60116652]
真实参数:[5.0, 1.2, -3.4, 5.6]。
误差较小,训练模型比较接近真实值。

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

def plot_learning_curves(model, X, y):
    """Plot train/validation RMSE learning curves for *model*.

    The data is split 50/50; the model is refit on growing prefixes of the
    training half, and RMSE on both the seen prefix and the full validation
    half is plotted against training-set size.
    """
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.5, random_state=0)
    train_errors, val_errors = [], []
    # Fix: range(1, len(X_train) + 1) — the original stopped one sample short
    # and never evaluated the model trained on the full training set.
    for m in range(1, len(X_train) + 1):
        model.fit(X_train[:m], y_train[:m])
        y_train_predict = model.predict(X_train[:m])
        y_val_predict = model.predict(X_val)
        train_errors.append(mean_squared_error(y_train[:m], y_train_predict))
        val_errors.append(mean_squared_error(y_val, y_val_predict))

    # RMSE = sqrt(MSE); plotted against the prefix size m.
    plt.plot(np.sqrt(train_errors), "r-+", linewidth=2, label="train")
    plt.plot(np.sqrt(val_errors), "b-", linewidth=3, label="val")
    plt.legend(loc="upper right", fontsize=14)
    plt.xlabel("Training set size", fontsize=14)
    plt.ylabel("RMSE", fontsize=14)

plot_learning_curves(lin_reg, poly_features, y)
plt.axis([0, 80, 0, 3])
plt.show()

在这里插入图片描述

②模型表现很好

from sklearn.linear_model import SGDRegressor

# eta0=1 is a deliberately oversized learning rate: SGD oscillates/diverges.
# Fix: np.infty was removed in NumPy 2.0 — use np.inf. tol=-inf disables
# early stopping so all max_iter epochs run.
sgd_reg = SGDRegressor(max_iter=50, tol=-np.inf, penalty=None, eta0=1, random_state=42)
plot_learning_curves(sgd_reg, poly_features, y)
sgd_reg.intercept_, sgd_reg.coef_
(array([5.43053764e+10]),
 array([ 1.83079539e+11, -5.28474831e+11, -7.16166266e+11]))

在这里插入图片描述

# eta0=0.001 is a very small learning rate: stable but slow convergence.
# Fix: np.infty was removed in NumPy 2.0 — use np.inf.
sgd_reg = SGDRegressor(max_iter=50, tol=-np.inf, penalty=None, eta0=0.001, random_state=42)
plot_learning_curves(sgd_reg, poly_features, y)
sgd_reg.intercept_, sgd_reg.coef_
(array([1.47656575]), array([ 1.4652543 , -1.55756018,  5.6544919 ]))

在这里插入图片描述

# eta0=0.03 is a well-chosen learning rate: fast, accurate convergence.
# Fix: np.infty was removed in NumPy 2.0 — use np.inf.
sgd_reg = SGDRegressor(max_iter=50, tol=-np.inf, penalty=None, eta0=0.03, random_state=42)
plot_learning_curves(sgd_reg, poly_features, y)
sgd_reg.intercept_, sgd_reg.coef_

(array([4.99299336]), array([ 1.19591598, -3.39576393,  5.60120054]))

在这里插入图片描述

③当学习率η过大时模型在最优解两边来回震荡;当学习率η过小时,收敛过程非常缓慢;当学习率合适时,收敛速度和效果都比较好。

3)线性函数拟合

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

# Plain linear model on the raw feature x only — underfits the cubic data.
lin_reg = LinearRegression()
lin_reg.fit(X[:n_train], y[:n_train])
# Fix: the held-out split starts at n_train, not n_test (equal only by chance).
y_predict = lin_reg.predict(X[n_train:])
# plt.plot(X[n_train:], y[n_train:], 'b.', label='true value')
# plt.plot(np.sort(X[n_train:, 0]), y_predict[np.argsort(X[n_train:, 0])], 'r-')
plt.show()
plot_learning_curves(lin_reg, X, y)

在这里插入图片描述

根据上图中模型在训练集和测试集上的学习曲线,该模型欠拟合,应增加模型的复杂度

4)三阶多项式函数模型拟合

n_train_two = 2  # train on only two samples to demonstrate overfitting

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

poly_reg = PolynomialFeatures(degree=3, include_bias=False)
x_train_poly = poly_reg.fit_transform(X[:n_train_two])
lin_reg = LinearRegression()
# Note: x_train_poly already holds only the first n_train_two rows — the
# original's extra [:n_train_two] slice was redundant.
lin_reg.fit(x_train_poly, y[:n_train_two])
# Fix: use transform (not fit_transform) on held-out data — the transformer
# belongs to the training split. (PolynomialFeatures happens to be stateless,
# but refitting on test data is the wrong pattern in general.)
y_predict = lin_reg.predict(poly_reg.transform(X[n_train_two:]))
plt.plot(X[n_train_two:], y[n_train_two:], 'b.')
order = np.argsort(X[n_train_two:, 0])
plt.plot(X[n_train_two:, 0][order], y_predict[order], c='r')
plt.show()
lin_reg.intercept_, lin_reg.coef_

在这里插入图片描述

(4.9972612573527595, array([ 1.22743188, -4.66558282,  4.04327166]))

①最佳参数为:[4.9972612573527595, 1.22743188, -4.66558282, 4.04327166]
真实参数为:[5.0, 1.2, -3.4, 5.6]
误差很大,训练模型与真实值相差很远。

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

def plot_learning_curves2(model, X_train, X_val, y_train, y_val):
    """Plot train/validation RMSE learning curves from a caller-supplied split.

    Same contract as plot_learning_curves, except the train/validation split
    is passed in explicitly instead of being drawn internally.
    """
    train_errors = []
    val_errors = []
    for size in range(1, len(X_train)):
        model.fit(X_train[:size], y_train[:size])
        pred_train = model.predict(X_train[:size])
        pred_val = model.predict(X_val)
        train_errors.append(mean_squared_error(y_train[:size], pred_train))
        val_errors.append(mean_squared_error(y_val, pred_val))

    plt.plot(np.sqrt(train_errors), "r-+", linewidth=2, label="train")
    plt.plot(np.sqrt(val_errors), "b-", linewidth=3, label="val")
    plt.legend(loc="upper right", fontsize=14)
    plt.xlabel("Training set size", fontsize=14)
    plt.ylabel("RMSE", fontsize=14)

from sklearn.pipeline import Pipeline

# Degree-3 polynomial pipeline: feature expansion, then linear regression.
cubic_steps = [
    ("poly_features", PolynomialFeatures(degree=3, include_bias=False)),
    ("lin_reg", LinearRegression()),
]
polynomial_regression = Pipeline(cubic_steps)
plot_learning_curves2(
    polynomial_regression,
    X[:n_train_two], X[n_train_two:],
    y[:n_train_two], y[n_train_two:],
)
plt.show()
lin_reg.coef_, lin_reg.intercept_

在这里插入图片描述

(array([ 1.22743188, -4.66558282,  4.04327166]), 4.9972612573527595)

②上图显示模型过拟合,可以通过增大训练集,增加正则项来改进

5)10阶多项式函数模型拟合

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

# Degree-10 polynomial: far more capacity than the true cubic model.
# (Variable names kept as-is — x_train_ploy is reused by the Ridge section.)
ploy_reg = PolynomialFeatures(degree = 10,include_bias=False)
x_train_ploy = ploy_reg.fit_transform(X[:n_train])
lin_reg = LinearRegression()
lin_reg.fit(x_train_ploy, y[:n_train])
# Fixes: slice the held-out data from n_train (not n_test), and use
# transform rather than refitting the transformer on test data.
y_predict = lin_reg.predict(ploy_reg.transform(X[n_train:]))
lin_reg.coef_, lin_reg.intercept_

(array([ 1.15790635e+00, -3.54010577e+00,  5.70843710e+00,  2.68713126e-01,
        -8.46298214e-02, -1.60528291e-01,  2.34308233e-02,  3.75767946e-02,
        -2.10616861e-03, -2.97381415e-03]), 4.990219576247277)

①最佳参数的取值为:[4.990219576247277, 1.15790635e+00, -3.54010577e+00, 5.70843710e+00, 2.68713126e-01,-8.46298214e-02, -1.60528291e-01, 2.34308233e-02, 3.75767946e-02,-2.10616861e-03, -2.97381415e-03]
真实参数为:[5.0, 1.2, -3.4, 5.6]。
误差很大,训练模型与真实值相差很远。

from sklearn.pipeline import Pipeline

# Learning curves for the unregularized degree-10 polynomial pipeline.
degree10_steps = [
    ("poly_features", PolynomialFeatures(degree=10, include_bias=False)),
    ("lin_reg", LinearRegression()),
]
polynomial_regression = Pipeline(degree10_steps)
plot_learning_curves(polynomial_regression, X, y)
plt.axis([0, 80, 0, 7000])
plt.show()

在这里插入图片描述

②根据上图中的学习曲线,10阶多项式函数模型过拟合。可以通过改用简单的模型来改进。

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge

# Standardize the degree-10 features: Ridge penalizes all coefficients with a
# single alpha, so features should be on a common scale.
std_scaler = StandardScaler()
x_train_scaler = std_scaler.fit_transform(x_train_ploy)
regul_reg = Ridge()
regul_reg.fit(x_train_scaler, y[:n_train])
# Fix: include the scaler in the pipeline — regul_reg was fitted on
# standardized features above, but the original pipeline fed it raw
# polynomial features when plot_learning_curves refit it.
polynomial_regression = Pipeline([
        ("poly_features", PolynomialFeatures(degree=10, include_bias=False)),
        ("std_scaler", StandardScaler()),
        ("regul_reg", regul_reg),
    ])
plot_learning_curves(polynomial_regression, X, y)
plt.axis([0, 80, 0, 7000])
plt.show()

在这里插入图片描述

# Display the Ridge (default alpha=1.0) coefficients and intercept.
regul_reg.coef_,regul_reg.intercept_

(array([ 2.09150783, -2.43044345,  3.14813857, -0.98764902,  1.80433369,
         0.40805287, -0.48957421, -0.08024497,  0.04289698,  0.00624638]),
 4.835916534954838)

③经过L2正则化之后,该模型仍然过拟合。

④正则化之后的最佳参数比原始最佳参数略大,L2正则化一定程度上减弱了过拟合,提高了泛化性

# Compare Ridge fits on the standardized features across several alphas:
# larger alpha shrinks the learned weights toward zero.
for alpha in (1e-05, 0, 10, 100):
    regul_reg = Ridge(alpha=alpha)
    regul_reg.fit(x_train_scaler, y[:n_train])
    print(f"超参数α为{alpha}时:", regul_reg.intercept_, regul_reg.coef_)

超参数α为1e-05时: -1.360578236656889 [ 1.19648908 -4.53568088 17.96760851  1.52336551 -1.13885604 -4.757882
  1.66330637  6.25853792 -0.84455821 -2.87640445]
超参数α为0时: -1.360578236656889 [ 1.194756   -4.54302313 17.98518876  1.58455352 -1.20412617 -4.94984298
  1.76350575  6.51350555 -0.89689257 -2.99454259]
超参数α为10时: -1.3605782366568893 [ 5.18335247 -3.08137697  7.66025297 -1.73910643  4.69274077 -0.50304537
  1.81448266  0.37587122 -0.08031236  0.96280935]
超参数α为100时: -1.3605782366568888 [ 4.22481188 -1.79165948  4.37013451 -1.16335591  3.32133205 -0.59438427
  2.36068741 -0.21399564  1.70099733  0.01452215]

⑤正则化超参数α越大,参数权值越接近0

已标记关键词 清除标记
相关推荐
©️2020 CSDN 皮肤主题: 大白 设计师:CSDN官方博客 返回首页