# [Ridge regression principle] To avoid overfitting, ridge regression keeps all feature variables but shrinks their coefficient values (the alpha parameter controls how strongly the coefficients are shrunk), reducing each feature's influence on the prediction. This is L2 regularization, and it tends to generalize well. In scikit-learn, ridge regression is provided by sklearn.linear_model.Ridge.
# Train/test split utility
from sklearn.model_selection import train_test_split
# Diabetes dataset: a noisy regression dataset bundled with scikit-learn
from sklearn.datasets import load_diabetes
# Ridge regression (linear model with L2 regularization)
from sklearn.linear_model import Ridge

# Load the dataset once. The original called load_diabetes() twice
# (once for .data, once for .target), building the Bunch redundantly.
diabetes = load_diabetes()
X, y = diabetes.data, diabetes.target
# Fixed random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=8)

# Baseline ridge model with the default alpha=1.0
ridge = Ridge().fit(X_train, y_train)
print("岭回归训练数据集得分:{:.2f}".format(ridge.score(X_train, y_train)))
print("岭回归测试数据集得分:{:.2f}".format(ridge.score(X_test, y_test)))
# Tune the regularization strength: a larger alpha shrinks the
# coefficients harder (stronger regularization); a smaller alpha
# moves the model closer to ordinary least squares.
ridge10 = Ridge(alpha=10).fit(X_train, y_train)
train_score_a10 = ridge10.score(X_train, y_train)
test_score_a10 = ridge10.score(X_test, y_test)
print("\n岭回归训练数据集得分:{:.2f}".format(train_score_a10))
print("岭回归测试数据集得分:{:.2f}".format(test_score_a10))

ridge01 = Ridge(alpha=0.1).fit(X_train, y_train)
train_score_a01 = ridge01.score(X_train, y_train)
test_score_a01 = ridge01.score(X_test, y_test)
print("\n岭回归训练数据集得分:{:.2f}".format(train_score_a01))
print("岭回归测试数据集得分:{:.2f}".format(test_score_a01))
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

# Unregularized baseline for comparing coefficient magnitudes.
lr = LinearRegression().fit(X_train, y_train)

# Visualize how alpha shrinks the learned coefficients: one marker
# series per fitted model, plotted against the coefficient index.
for model, marker, label in (
    (ridge, 's', 'Ridge alpha=1'),       # square marker
    (ridge10, '^', 'Ridge alpha=10'),    # triangle_up marker
    (ridge01, 'v', 'Ridge alpha=0.1'),   # triangle_down marker
    (lr, 'o', 'linear regression'),      # circle marker
):
    plt.plot(model.coef_, marker, label=label)
plt.xlabel("coefficient index")
plt.ylabel("coefficient magnitude")
plt.hlines(0, 0, len(lr.coef_))  # zero reference line
plt.legend(loc='best')
plt.show()
#改变训练数据集的数据量,得模型评分折线图
from sklearn.model_selection import learning_curve,KFold
#定义绘制学习曲线的函数
def plot_learning_curve(est, X, y):
    """Plot mean train/test scores of *est* as the training set grows.

    Evaluates 20 training-set sizes from 10% to 100% of the data with
    20-fold shuffled cross-validation (fixed random_state for
    reproducibility). The train and test curves share one color so
    each estimator's pair is easy to read on a combined figure.

    Parameters:
        est: an unfitted scikit-learn estimator (cloned internally
             by learning_curve).
        X, y: feature matrix and target vector.
    """
    # Bug fix: numpy was never imported at module level in the original
    # script, so np.linspace raised NameError at call time.
    import numpy as np

    training_set_size, train_scores, test_scores = learning_curve(
        est, X, y,
        train_sizes=np.linspace(.1, 1, 20),
        cv=KFold(20, shuffle=True, random_state=1),
    )
    estimator_name = est.__class__.__name__
    # Mean over the CV folds (axis=1) gives one score per training size.
    line = plt.plot(training_set_size, train_scores.mean(axis=1), '--',
                    label="training " + estimator_name)
    plt.plot(training_set_size, test_scores.mean(axis=1), '-',
             label="test " + estimator_name, c=line[0].get_color())
    plt.xlabel('Training set size')
    plt.ylabel('Score')
    plt.ylim(0, 1.1)
# Draw both learning curves on one figure: ridge (alpha=1) first,
# then the plain linear-regression baseline.
for estimator in (Ridge(alpha=1), LinearRegression()):
    plot_learning_curve(estimator, X, y)
plt.legend(loc=(0, 1.05), ncol=2, fontsize=11)
plt.show()