第8章（上）：多项式回归与模型泛化

CaiGbro

已于 2022-11-05 19:51:35 修改

阅读量346

点赞数 1

分类专栏：机器学习笔记 | 文章标签：回归 python 人工智能

于 2022-10-29 16:34:48 首次发布

本文链接：https://blog.csdn.net/weixin_52449030/article/details/127588711

版权

机器学习笔记专栏收录该内容

17 篇文章 1 订阅

订阅专栏

8-1 什么是多项式回归

Notebook 示例

Notebook 源码

[1]
import numpy as np
import matplotlib.pyplot as plt
[2]
x = np.random.uniform(-3, 3, size=100)
X = x.reshape(-1,1)
[3]
y = 0.5 * x**2 + x + 2 + np.random.normal(0, 1, size=100)
[4]
plt.scatter(x,y)
<matplotlib.collections.PathCollection at 0x1c6b9056910>

[5]
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(X,y)
LinearRegression()
[6]
y_predict = lin_reg.predict(X)
[7]
plt.scatter(X, y)
plt.plot(x, y_predict, color='r')
[<matplotlib.lines.Line2D at 0x1c6bba7a4c0>]

解决方案， 添加一个特征
[8]
(X**2).shape
(100, 1)
[9]
X2 = np.hstack([X,X**2])
[10]
X2.shape
(100, 2)
[11]
lin_reg2 = LinearRegression()
lin_reg2.fit(X2,y)
y_predict2 = lin_reg2.predict(X2)
[12]
plt.scatter(X, y)
plt.plot(x, y_predict2, color='r')
[<matplotlib.lines.Line2D at 0x1c6bbaeaaf0>]

[13]
plt.scatter(X, y)
plt.plot(np.sort(x), y_predict2[np.argsort(x)], color='r')
[<matplotlib.lines.Line2D at 0x1c6bbb5d580>]

[14]
lin_reg2.coef_
array([1.20691609, 0.45458127])
[15]
lin_reg2.intercept_
2.0128010763968693

8-2 scikit-learn 中的多项式回归与 Pipeline

Notebook 示例

Notebook 源码

scikit-learn中的多项式回归和 Pipeline
[1]
import numpy as np
import matplotlib.pyplot as plt
[2]
x = np.random.uniform(-3, 3, size=100)
X = x.reshape(-1,1)
y = 0.5 * x**2 + x + 2 + np.random.normal(0, 1, size=100)
[3]
from sklearn.preprocessing import PolynomialFeatures
[5]
poly = PolynomialFeatures(degree=2)
poly.fit(X)
X2  = poly.transform(X)
[6]
X2.shape
(100, 3)
[7]
X2[:5,:]
array([[ 1.00000000e+00,  2.37510398e+00,  5.64111892e+00],
       [ 1.00000000e+00, -1.81116137e+00,  3.28030550e+00],
       [ 1.00000000e+00,  5.98057818e-01,  3.57673154e-01],
       [ 1.00000000e+00, -2.30435091e+00,  5.31003312e+00],
       [ 1.00000000e+00,  6.89603539e-02,  4.75553041e-03]])
[8]
X[:5,:]
array([[ 2.37510398],
       [-1.81116137],
       [ 0.59805782],
       [-2.30435091],
       [ 0.06896035]])
[9]
from sklearn.linear_model import LinearRegression

lin_reg2 = LinearRegression()
lin_reg2.fit(X2,y)
y_predict2 = lin_reg2.predict(X2)
[10]
plt.scatter(X, y)
plt.plot(np.sort(x), y_predict2[np.argsort(x)], color='r')
[<matplotlib.lines.Line2D at 0x21108349be0>]

[11]
lin_reg2.coef_
array([0.        , 0.98483836, 0.452626  ])
[12]
lin_reg2.intercept_
2.0254219063569225
关于PolynomialFeatures
[13]
X = np.arange(1, 11).reshape(-1, 2)
[14]
X.shape
(5, 2)
[15]
X
array([[ 1,  2],
       [ 3,  4],
       [ 5,  6],
       [ 7,  8],
       [ 9, 10]])
[16]
poly = PolynomialFeatures(degree=2)
poly.fit(X)
X2  = poly.transform(X)
[17]
X2.shape
(5, 6)
[18]
X2
array([[  1.,   1.,   2.,   1.,   2.,   4.],
       [  1.,   3.,   4.,   9.,  12.,  16.],
       [  1.,   5.,   6.,  25.,  30.,  36.],
       [  1.,   7.,   8.,  49.,  56.,  64.],
       [  1.,   9.,  10.,  81.,  90., 100.]])
[19]
poly = PolynomialFeatures(degree=3)
poly.fit(X)
X3  = poly.transform(X)
[20]
X3.shape
(5, 10)
[21]
X3
array([[   1.,    1.,    2.,    1.,    2.,    4.,    1.,    2.,    4.,
           8.],
       [   1.,    3.,    4.,    9.,   12.,   16.,   27.,   36.,   48.,
          64.],
       [   1.,    5.,    6.,   25.,   30.,   36.,  125.,  150.,  180.,
         216.],
       [   1.,    7.,    8.,   49.,   56.,   64.,  343.,  392.,  448.,
         512.],
       [   1.,    9.,   10.,   81.,   90.,  100.,  729.,  810.,  900.,
        1000.]])
Pipeline
[22]
x = np.random.uniform(-3, 3, size=100)
X = x.reshape(-1,1)
y = 0.5 * x**2 + x + 2 + np.random.normal(0, 1, size=100)
[23]
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

poly_reg = Pipeline([
    ("poly",PolynomialFeatures(degree=2)),
    ("std_scaler",StandardScaler()),
    ("lin_reg",LinearRegression())
])
[24]
poly_reg.fit(X,y)
y_predict = poly_reg.predict(X)
[25]
plt.scatter(X, y)
plt.plot(np.sort(x), y_predict[np.argsort(x)], color='r')
[<matplotlib.lines.Line2D at 0x2110845de80>]

8-3 过拟合与欠拟合

Notebook 示例

Notebook 源码

[1]
import numpy as np
import matplotlib.pyplot as plt
[2]
x = np.random.uniform(-3, 3, size=100)
X = x.reshape(-1,1)
y = 0.5 * x**2 + x + 2 + np.random.normal(0, 1, size=100)
[3]
plt.scatter(X, y)
<matplotlib.collections.PathCollection at 0x2a05e646850>

[4]
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(X,y)
lin_reg.score(X,y)
0.40801505133573723
[5]
y_predict = lin_reg.predict(X)
plt.scatter(X, y)
plt.plot(x, y_predict, color='r')
[<matplotlib.lines.Line2D at 0x2a061057bb0>]

[6]
from sklearn.metrics import mean_squared_error

y_predict = lin_reg.predict(X)
mean_squared_error(y,y_predict)
2.9187245186308566
使用多项式回归
[7]
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

def PolynomialRegression(degree):
    """Build an unfitted polynomial-regression pipeline.

    The returned ``Pipeline`` chains three steps, in this order:
    ``"poly"`` (``PolynomialFeatures`` with the given ``degree``),
    ``"std_scaler"`` (``StandardScaler``) and ``"lin_reg"``
    (``LinearRegression``).
    """
    steps = [
        # The polynomial degree is the only setting this factory exposes.
        ("poly", PolynomialFeatures(degree=degree)),
        ("std_scaler", StandardScaler()),
        ("lin_reg", LinearRegression()),
    ]
    return Pipeline(steps)
[8]
poly2_reg = PolynomialRegression(degree=2)
poly2_reg.fit(X,y)
Pipeline(steps=[('poly', PolynomialFeatures()),
                ('std_scaler', StandardScaler()),
                ('lin_reg', LinearRegression())])
[9]
y2_predict = poly2_reg.predict(X)
mean_squared_error(y,y2_predict)
0.8399501409780341
[10]
plt.scatter(X, y)
plt.plot(np.sort(x), y2_predict[np.argsort(x)], color='r')
[<matplotlib.lines.Line2D at 0x2a0610f9130>]

[11]
poly10_reg = PolynomialRegression(degree=10)
poly10_reg.fit(X,y)

y10_predict = poly10_reg.predict(X)
mean_squared_error(y,y10_predict)
0.7755990046397335
[12]
plt.scatter(X, y)
plt.plot(np.sort(x), y10_predict[np.argsort(x)], color='r')
[<matplotlib.lines.Line2D at 0x2a061166040>]

[13]
poly100_reg = PolynomialRegression(degree=100)
poly100_reg.fit(X,y)

y100_predict = poly100_reg.predict(X)
mean_squared_error(y,y100_predict)
0.46706858760890346
[14]
plt.scatter(X, y)
plt.plot(np.sort(x), y100_predict[np.argsort(x)], color='r')
[<matplotlib.lines.Line2D at 0x2a0611bde50>]

[15]
X_plot = np.linspace(-3, 3, 100).reshape(100,1)
y_plot = poly100_reg.predict(X_plot)

plt.scatter(x,y)
plt.plot(X_plot[:,0],y_plot,color='r')
[<matplotlib.lines.Line2D at 0x2a06122bfa0>]

[16]
X_plot = np.linspace(-3, 3, 100).reshape(100,1)
y_plot = poly100_reg.predict(X_plot)
plt.axis([-3, 3, -1,10])
plt.scatter(x,y)
plt.plot(X_plot[:,0],y_plot,color='r')
[<matplotlib.lines.Line2D at 0x2a06129fa00>]

8-4 为什么要训练数据集与测试数据集

Notebook 示例

Notebook 源码

[1]
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
[2]
x = np.random.uniform(-3, 3, size=100)
X = x.reshape(-1,1)
y = 0.5 * x**2 + x + 2 + np.random.normal(0, 1, size=100)
[3]
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

def PolynomialRegression(degree):
    """Return a fresh, unfitted pipeline for polynomial regression.

    Steps, in order: ``"poly"`` expands the input with
    ``PolynomialFeatures(degree=degree)``, ``"std_scaler"`` applies
    ``StandardScaler``, and ``"lin_reg"`` is a ``LinearRegression``.
    """
    poly_step = ("poly", PolynomialFeatures(degree=degree))
    scale_step = ("std_scaler", StandardScaler())
    regress_step = ("lin_reg", LinearRegression())
    return Pipeline([poly_step, scale_step, regress_step])
[4]
poly100_reg = PolynomialRegression(degree=100)
poly100_reg.fit(X,y)

y100_predict = poly100_reg.predict(X)
mean_squared_error(y,y100_predict)
0.545801916826715
[5]
X_plot = np.linspace(-3, 3, 100).reshape(100,1)
y_plot = poly100_reg.predict(X_plot)
plt.axis([-3, 3, -1,10])
plt.scatter(x,y)
plt.plot(X_plot[:,0],y_plot,color='r')
[<matplotlib.lines.Line2D at 0x264b1a80220>]

train test split 的意义
[18]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)
[19]
lin_reg = LinearRegression()
lin_reg.fit(X_train,y_train)
y_predict = lin_reg.predict(X_test)
mean_squared_error(y_test,y_predict)
2.0016192761256106
[20]
poly2_reg = PolynomialRegression(degree=2)
poly2_reg.fit(X_train,y_train)

y2_predict = poly2_reg.predict(X_test)
mean_squared_error(y_test,y2_predict)
0.8234102979179919
[21]
poly10_reg = PolynomialRegression(degree=10)
poly10_reg.fit(X_train,y_train)

y10_predict = poly10_reg.predict(X_test)
mean_squared_error(y_test,y10_predict)
0.9220845817062795
[22]
poly100_reg = PolynomialRegression(degree=100)
poly100_reg.fit(X_train,y_train)

y100_predict = poly100_reg.predict(X_test)
mean_squared_error(y_test,y100_predict)
6.291390102640921e+20

8-5 学习曲线

Notebook 示例

Notebook 源码

学习曲线
[1]
import numpy as np
import matplotlib.pyplot as plt
[2]
np.random.seed(666)
x = np.random.uniform(-3, 3, size=100)
X = x.reshape(-1,1)
y = 0.5 * x**2 + x + 2 + np.random.normal(0, 1, size=100)
[3]
plt.scatter(x,y)
<matplotlib.collections.PathCollection at 0x28218d66820>

学习曲线
[4]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10)
[5]
X_train.shape # seed 不影响shape
(75, 1)
[6]
X_test.shape
(25, 1)
[7]
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Learning curve for plain linear regression: refit on the first i training
# samples and record the MSE on that prefix and on the whole test set.
# The upper bound 76 covers prefix sizes 1..75 (X_train has 75 rows here).
train_score = []
test_score = []
for i in range(1, 76):
    X_sub, y_sub = X_train[:i], y_train[:i]

    lin_reg = LinearRegression()
    lin_reg.fit(X_sub, y_sub)

    train_score.append(mean_squared_error(y_sub, lin_reg.predict(X_sub)))
    test_score.append(mean_squared_error(y_test, lin_reg.predict(X_test)))
[8]
plt.plot([i for i in range(1,76)],np.sqrt(train_score),label="train")
plt.plot([i for i in range(1,76)],np.sqrt(test_score),label="test")
plt.legend()
<matplotlib.legend.Legend at 0x2821b77ae50>

[13]
def plot_learning_curve(algo, X_train, X_test, y_train, y_test):
    """Plot train and test RMSE of ``algo`` as the training prefix grows.

    For every prefix size k from 1 to ``len(X_train)``, ``algo`` is refit on
    the first k training samples. The mean squared error on that prefix and
    on the full test set is recorded, and the square roots of both series
    are drawn against k with matplotlib.

    ``algo`` is fitted in place on every iteration, so after the call it
    holds the fit on the complete training set.
    """
    sizes = list(range(1, len(X_train) + 1))
    train_mse = []
    test_mse = []
    for k in sizes:
        X_sub, y_sub = X_train[:k], y_train[:k]
        algo.fit(X_sub, y_sub)

        train_mse.append(mean_squared_error(y_sub, algo.predict(X_sub)))
        test_mse.append(mean_squared_error(y_test, algo.predict(X_test)))

    plt.plot(sizes, np.sqrt(train_mse), label="train")
    plt.plot(sizes, np.sqrt(test_mse), label="test")
    plt.legend()
    # Fixed y-range of 0..4 so curves from different models share one scale.
    plt.axis([0, len(X_train) + 1, 0, 4])
plot_learning_curve(LinearRegression(), X_train,X_test,y_train,y_test)

[14]
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

def PolynomialRegression(degree):
    """Create an unfitted polynomial-regression ``Pipeline``.

    The pipeline runs ``PolynomialFeatures(degree=degree)`` (step
    ``"poly"``), then ``StandardScaler`` (step ``"std_scaler"``), then
    ``LinearRegression`` (step ``"lin_reg"``).
    """
    pipeline_steps = [
        # ``degree`` is forwarded unchanged to the feature expansion.
        ("poly", PolynomialFeatures(degree=degree)),
        ("std_scaler", StandardScaler()),
        ("lin_reg", LinearRegression()),
    ]
    return Pipeline(pipeline_steps)
[15]
poly2_reg = PolynomialRegression(degree=2)
plot_learning_curve(poly2_reg,X_train, X_test, y_train, y_test)

[16]
poly2_reg = PolynomialRegression(degree=20)
plot_learning_curve(poly2_reg,X_train, X_test, y_train, y_test)