向前逐步回归
我们在进行拟合的时候,没有必要将所有变量都引入到我们的函数之中,这种操作往往会导致过拟合,而过拟合带来的致命影响就是泛化能力差,最小二乘法估计参数的时候无法终止学习的过程。向前逐步回归的引入则可以控制学习过程中出现的过拟合,它是对最小二乘法的一种优化改进。其基本思想就是由少到多地向模型中引入变量,每次增加一个,直到没有可以引入变量为止,最后通过比较在预留样本上计算出的错误进行模型的选择。
参考链接:https://www.csdn.net/tags/NtzaUg3sNTc4NjktYmxvZwO0O0OO0O0O.html
这里我们用到的是红葡萄酒质量数据集,下载链接如下:https://archive.ics.uci.edu/ml/datasets/Wine+Quality
#导入要用到的各种包和函数
import numpy as np
import pandas as pd
from sklearn import datasets,linear_model
from math import sqrt
import matplotlib.pyplot as plt
# Load the red-wine quality dataset (semicolon-separated CSV with a header row).
wine_data = pd.read_csv('winequality-red.csv', header=0, sep=';', encoding='utf-8')
# Peek at the first rows to confirm the file parsed correctly.
wine_data.head()
fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 |
1 | 7.8 | 0.88 | 0.00 | 2.6 | 0.098 | 25.0 | 67.0 | 0.9968 | 3.20 | 0.68 | 9.8 | 5 |
2 | 7.8 | 0.76 | 0.04 | 2.3 | 0.092 | 15.0 | 54.0 | 0.9970 | 3.26 | 0.65 | 9.8 | 5 |
3 | 11.2 | 0.28 | 0.56 | 1.9 | 0.075 | 17.0 | 60.0 | 0.9980 | 3.16 | 0.58 | 9.8 | 6 |
4 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 |
# Summary statistics (count / mean / std / quartiles) for every column.
wine_data.describe()
fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 |
mean | 8.319637 | 0.527821 | 0.270976 | 2.538806 | 0.087467 | 15.874922 | 46.467792 | 0.996747 | 3.311113 | 0.658149 | 10.422983 | 5.636023 |
std | 1.741096 | 0.179060 | 0.194801 | 1.409928 | 0.047065 | 10.460157 | 32.895324 | 0.001887 | 0.154386 | 0.169507 | 1.065668 | 0.807569 |
min | 4.600000 | 0.120000 | 0.000000 | 0.900000 | 0.012000 | 1.000000 | 6.000000 | 0.990070 | 2.740000 | 0.330000 | 8.400000 | 3.000000 |
25% | 7.100000 | 0.390000 | 0.090000 | 1.900000 | 0.070000 | 7.000000 | 22.000000 | 0.995600 | 3.210000 | 0.550000 | 9.500000 | 5.000000 |
50% | 7.900000 | 0.520000 | 0.260000 | 2.200000 | 0.079000 | 14.000000 | 38.000000 | 0.996750 | 3.310000 | 0.620000 | 10.200000 | 6.000000 |
75% | 9.200000 | 0.640000 | 0.420000 | 2.600000 | 0.090000 | 21.000000 | 62.000000 | 0.997835 | 3.400000 | 0.730000 | 11.100000 | 6.000000 |
max | 15.900000 | 1.580000 | 1.000000 | 15.500000 | 0.611000 | 72.000000 | 289.000000 | 1.003690 | 4.010000 | 2.000000 | 14.900000 | 8.000000 |
# Helper used by the stepwise loop: project a row-major matrix onto a column subset.
def xattrSelect(x, idxSet):
    """Return a new matrix keeping, for every row of x, only the columns in idxSet."""
    return [[row[col] for col in idxSet] for row in x]
xList = []  # feature rows (one list of 11 attribute values per wine sample)
# Labels: last column ("quality") converted to floats for regression.
labels = [float(label) for label in wine_data.iloc[:, -1].tolist()]
names = wine_data.columns.tolist()  # attribute names, used later for reporting
for i in range(len(wine_data)):
    # BUG FIX: convert the row Series to a plain list. The downstream helper
    # indexes rows positionally (row[i]); integer positional indexing on a
    # label-indexed pandas Series is deprecated and removed in newer pandas.
    xList.append(wine_data.iloc[i, 0:-1].tolist())
# Deterministic 2/3 train, 1/3 test split: every third sample goes to the test set.
indices = range(len(xList))
xListTest = [xList[i] for i in indices if i % 3 == 0]
xListTrain = [xList[i] for i in indices if i % 3 != 0]
labelsTest = [labels[i] for i in indices if i % 3 == 0]
labelsTrain = [labels[i] for i in indices if i % 3 != 0]
attributeList = []  # indices of attributes selected so far, in selection order
index = range(len(xList[1]))  # one selection pass per available attribute
indexSet = set(index)  # the full pool of candidate column indices
oosError = []  # best out-of-sample RMSE recorded after each pass
for i in index:
    attSet = set(attributeList)
    # Candidates = attributes not yet selected.
    # BUG FIX: sort the candidates instead of iterating the raw set, so ties in
    # RMSE are broken deterministically across runs/interpreters.
    attTry = sorted(indexSet - attSet)
    errorList = []  # test RMSE for each candidate tried in this pass
    for iTry in attTry:
        # Tentatively extend the current attribute set with one candidate.
        # (Removed dead `attTemp = []` reset assignments from the original.)
        attTemp = attributeList + [iTry]
        # Project train/test matrices onto the tentative column subset.
        xTrain = np.array(xattrSelect(xListTrain, attTemp))
        xTest = np.array(xattrSelect(xListTest, attTemp))
        yTrain = np.array(labelsTrain)
        yTest = np.array(labelsTest)
        # Fit ordinary least squares on the projected training data.
        wineQModel = linear_model.LinearRegression()
        wineQModel.fit(xTrain, yTrain)
        # Root-mean-square error on the held-out third.
        rmsError = np.linalg.norm((yTest - wineQModel.predict(xTest)), 2) / sqrt(len(yTest))
        errorList.append(rmsError)
    iBest = np.argmin(errorList)  # candidate with the smallest test RMSE
    attributeList.append(attTry[iBest])  # commit the winning attribute
    oosError.append(errorList[iBest])  # remember the best RMSE for this size
print("Out of sample error versus attribute set size")
print(oosError)
print('\n' + 'Best attribute indices')
print(attributeList)
namesList = [names[i] for i in attributeList]
print("\n" + "Best attribute names")
print(namesList)
Out of sample error versus attribute set size
[0.8162067605843373, 0.7638643857209025, 0.7638643857209025, 0.7638281646287916, 0.7570245479523805, 0.755096811807613, 0.7402464046416789, 0.7095983105997431, 0.7078841064360114, 0.673980209520692, 0.656971757677521, 0.6573909869011335]
Best attribute indices
[0, 1, 1, 3, 4, 5, 6, 7, 8, 9, 10, 2]
Best attribute names
['fixed acidity', 'volatile acidity', 'volatile acidity', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'citric acid']
# Plot test-set RMSE as a function of the number of attributes in the model.
attrCounts = range(len(oosError))
plt.plot(attrCounts, oosError, 'k')
plt.xlabel('Number of Attributes')
plt.ylabel('Error(RMS)')
plt.show()
# Histogram of test-set residuals for the model with the best attribute count.
indexBest = oosError.index(min(oosError))  # position of the smallest RMSE
# BUG FIX: the original sliced attributeList[1:indexBest+1], which skipped the
# first (best single) attribute. The first indexBest+1 selected attributes are
# attributeList[:indexBest + 1] (Python lists are 0-indexed).
attributesBest = attributeList[:indexBest + 1]
# Project train/test data onto the winning attribute subset.
xTrainTemp = xattrSelect(xListTrain, attributesBest)
xTestTemp = xattrSelect(xListTest, attributesBest)
xTrain = np.array(xTrainTemp)
xTest = np.array(xTestTemp)
# Refit and plot the residual histogram. yTrain/yTest persist from the
# selection loop above (they are constant across its iterations).
wineQModel = linear_model.LinearRegression()
wineQModel.fit(xTrain, yTrain)
errorVector = yTest - wineQModel.predict(xTest)
plt.hist(errorVector)
plt.xlabel('Bin Boundaries')
plt.ylabel('Counts')
plt.show()
# Scatter plot of predicted vs. actual taste scores on the test set.
predictedScores = wineQModel.predict(xTest)
plt.scatter(predictedScores, yTest, s=100, alpha=1)
plt.xlabel("Predict Taste Score")
plt.ylabel("Actual Taste Score")
plt.show()