import numpy as np
import matplotlib.pyplot as plt

# Fix the RNG seed so the synthetic data set is reproducible.
np.random.seed(666)

# 100 scalar samples drawn uniformly from [-3, 3].
x = np.random.uniform(-3.0, 3.0, size=100)
# sklearn estimators want a 2-D feature matrix: one row per sample, one column.
X = x.reshape(-1, 1)
# Quadratic ground truth y = 0.5*x^2 + x + 2 plus standard-normal noise.
noise = np.random.normal(0, 1, size=100)
y = 0.5 * x ** 2 + x + 2 + noise

plt.scatter(x, y)
plt.show()
输出图片:
使用线性回归
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares line to the (clearly curved) data.
lin_reg = LinearRegression()
lin_reg.fit(X, y)
# Training-set R^2, echoed by the notebook (~0.495 — a poor fit).
lin_reg.score(X, y)

# Overlay the fitted line on the raw points; sort by x so the curve is drawn
# left to right instead of zig-zagging between unsorted samples.
y_predict = lin_reg.predict(X)
order = np.argsort(x)
plt.scatter(x, y)
plt.plot(x[order], y_predict[order], color='r')
plt.show()
输出图片:
from sklearn.metrics import mean_squared_error  # quantify fit quality via MSE

# Training-set mean squared error of the straight-line model (~3.075):
# this is the underfitting baseline the polynomial models are compared against.
y_predict = lin_reg.predict(X)
mean_squared_error(y, y_predict)
输出:3.0750025765636577
用直线来拟合数据,显然没有很好地反映原始数据的样本特征,这种情况叫做欠拟合
使用多项式回归
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
def PolymialRegression(degree):
    """Build a polynomial-regression pipeline.

    Steps: expand features to all polynomial terms up to ``degree``,
    standardise them (higher-degree terms have wildly different scales),
    then fit ordinary least squares on the scaled features.

    Parameters
    ----------
    degree : int
        Highest polynomial degree generated by PolynomialFeatures.
    """
    steps = [
        ("poly", PolynomialFeatures(degree=degree)),
        ("std_scaler", StandardScaler()),
        ("lin_reg", LinearRegression()),
    ]
    return Pipeline(steps)
# degree=2 matches the true generating process (0.5*x^2 + x + 2) exactly.
poly2_reg = PolymialRegression(degree=2)
poly2_reg.fit(X, y)
输出:Pipeline(steps=[('poly', PolynomialFeatures()),
                ('std_scaler', StandardScaler()),
                ('lin_reg', LinearRegression())])
# MSE drops to ~1.099 — the quadratic model fits far better than the line.
y2_predict = poly2_reg.predict(X)
mean_squared_error(y, y2_predict)
输出:1.0987392142417856
# Plot the degree-2 fit over the raw points.
# BUG FIX: the original referenced ``y10_predict``, which is never defined
# anywhere in this script and raises a NameError; the degree-2 predictions
# computed above are stored in ``y2_predict``.
plt.scatter(x, y)
plt.plot(np.sort(x), y2_predict[np.argsort(x)], color='r')
plt.show()
输出图片:
将degree的值放大再来看结果:
# Push the degree far past the truth to provoke overfitting.
poly100_reg = PolymialRegression(degree=100)
poly100_reg.fit(X, y)
y100_predict = poly100_reg.predict(X)
# Training MSE keeps shrinking (~0.687) even though generalisation is worse.
mean_squared_error(y, y100_predict)
输出:0.6870911922673567
# Draw the degree-100 curve through the training points, sorted by x so the
# line is rendered left to right.
order = np.argsort(x)
plt.scatter(x, y)
plt.plot(x[order], y100_predict[order], color='r')
plt.show()
输出图片:
# Evaluate the degree-100 model on a dense, evenly spaced grid: between the
# training samples the curve oscillates wildly, which the sample-only plot
# above hides. Clamp the y-axis to [-1, 10] so the spikes don't dominate.
X_plot = np.linspace(-3, 3, 100).reshape(-1, 1)
y_plot = poly100_reg.predict(X_plot)

plt.scatter(x, y)
plt.plot(X_plot.ravel(), y_plot, color='r')
plt.axis([-3, 3, -1, 10])
plt.show()
输出结果:
虽然均方误差变得越来越小,但是该曲线为了拟合我们给定的所有的样本点而变得太过复杂了,这种情况叫做过拟合