First we generate a dataset (X, y), then fit it with a linear regression model.
We use sklearn's learning-curve function learning_curve. For regression problems, the score it returns is R², a goodness-of-fit metric whose maximum value is 1. The closer R² is to 1, the better the regression curve fits the observed values.
Here y = np.sqrt(X) plus a little noise, so degree-1 polynomial features underfit, degree-3 features fit about right, and degree-10 features overfit; the effect is visible in the plotted learning curves.
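As a quick refresher on the metric (a minimal sketch; the arrays below are made-up values purely for illustration), R² can be computed by hand as 1 - SS_res / SS_tot, or obtained from sklearn.metrics.r2_score:

import numpy as np
from sklearn.metrics import r2_score

y_true = np.array([1.0, 2.0, 3.0, 4.0])   # hypothetical observations
y_pred = np.array([1.1, 1.9, 3.2, 3.9])   # hypothetical predictions

ss_res = np.sum((y_true - y_pred) ** 2)          # residual sum of squares
ss_tot = np.sum((y_true - y_true.mean()) ** 2)   # total sum of squares
print(1 - ss_res / ss_tot)        # manual R^2
print(r2_score(y_true, y_pred))   # same value from sklearn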
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
n_dots = 200

# y = sqrt(X) plus uniform noise drawn from [-0.1, 0.1)
X = np.linspace(0, 1, n_dots)
y = np.sqrt(X) + 0.2 * np.random.rand(n_dots) - 0.1

# reshape to column vectors: sklearn estimators expect 2-D inputs
X = X.reshape(-1, 1)
y = y.reshape(-1, 1)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
def polynomial_model(degree=1):
    # Generate polynomial features of X. With two features a and b, the
    # degree-2 expansion is (1, a, b, a^2, ab, b^2); include_bias=False
    # drops the leading constant column 1. (See the short demo below.)
    polynomial_features = PolynomialFeatures(degree=degree,
                                             include_bias=False)
    linear_regression = LinearRegression()
    pipeline = Pipeline([("polynomial_features", polynomial_features),
                         ("linear_regression", linear_regression)])
    return pipeline
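As a quick sanity check of what PolynomialFeatures produces (a small sketch; the sample values are made up):

sample = np.array([[2, 3]])   # one sample with features a=2, b=3
print(PolynomialFeatures(degree=2).fit_transform(sample))
# [[1. 2. 3. 4. 6. 9.]]  ->  (1, a, b, a^2, ab, b^2)
print(PolynomialFeatures(degree=2, include_bias=False).fit_transform(sample))
# [[2. 3. 4. 6. 9.]]     ->  the bias column 1 is dropped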
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    train_sizes=np.linspace(.1, 1.0, 5) = array([0.1, 0.325, 0.55, 0.775, 1.]),
    i.e. 10%, 32.5%, 55%, 77.5% and 100% of the training portion of the data
    are used in turn as training subsets. For each subset size, the data are
    split into training and test sets according to cv and the estimator's
    test-set score is computed. Since this is a regression problem, the score
    reported by learning_curve is R^2 (the estimator's default score); see the
    shape-inspection sketch right after this function.
    """
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    # train_scores.shape = (5, 10), test_scores.shape = (5, 10): 5 subset
    # sizes, each evaluated on the 10 different (train, test) splits from cv
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()
    # shaded bands: mean score +/- one standard deviation over the cv rounds
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o--', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.legend(loc="best")
    return plt
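To see those shapes concretely, learning_curve can also be called directly (a minimal sketch reusing the data and model defined above, with the same ShuffleSplit settings as below):

sizes, train_scores, test_scores = learning_curve(
    polynomial_model(3), X, y,
    cv=ShuffleSplit(n_splits=10, test_size=0.2, random_state=0),
    train_sizes=np.linspace(.1, 1.0, 5))
print(sizes)               # absolute subset sizes: [ 16  52  88 124 160]
print(train_scores.shape)  # (5, 10): 5 subset sizes x 10 cv rounds
print(test_scores.shape)   # (5, 10)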
# Randomly hold out 20% of the data as the test set and train on the other 80%;
# repeat 10 times, producing 10 different (train, test) pairs
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
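# Sanity check (sketch): each of the 10 ShuffleSplit rounds yields an
# independent random 80/20 partition of the 200 samples.
for train_idx, test_idx in cv.split(X):
    print(len(train_idx), len(test_idx))  # -> 160 40
    break  # look at the first round only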
titles = ['Learning Curves (Underfitting)',
          'Learning Curves',
          'Learning Curves (Overfitting)']
degrees = [1, 3, 10]
plt.figure(figsize=(18, 4), dpi=200)
for i in range(len(degrees)):
    plt.subplot(1, 3, i + 1)
    plot_learning_curve(polynomial_model(degrees[i]), titles[i], X, y,
                        ylim=(0.75, 1.01), cv=cv)
plt.show()