机器学习的主要目的还是分类和回归。笔者认为分类问题可以进行回归处理,通过对每个类别标签设置实数值(+1,0),进行回归预测,并设定阈值以判断出合适的类别(这里涉及到一个真正/真负/假正/假负,并绘制ROC分类曲线问题)。因此此处我们就可以将惩罚线性问题推广到实际应用中。
惩罚线性回归问题,笔者已前期介绍过,实现最小均方差的同时,引入惩罚项。
惩罚线性回归模型下的回归预测问题
在上节中,我们采用自己实现的LARS算法,实现求解惩罚线性回归问题。而在python里面有直接的安装包实现lasso(套索)回归型函数。
以下代码采用10折交叉验证的方法,程序在10份数据的每一份上绘制错误随alpha(惩罚项的权重因子)变化的曲线,同时绘制在10份数据上的错误平均值。值得一提的是,交叉验证有两种常见用途:1、用于评估所选模型的性能;2、用于遍历选择模型中的某些最优参数值。此处交叉验证的使用目的为第2种。
__author__ = 'mike-bowles'
import numpy
from sklearn import datasets, linear_model
from sklearn.linear_model import LassoCV
from math import sqrt
import matplotlib.pyplot as plot
import csv
# --- Read the wine-quality data set ---------------------------------------
# The file is semicolon-delimited; the first line holds the column names and
# the last field of every data row is the quality score (the label).
# Uses the csv module (already imported above) instead of hand-rolled
# split(";"), which also handles quoted header fields correctly.
file = 'winequality-red.csv'
xList = []    # attribute rows, each a list of floats
labels = []   # quality score per row
names = []    # column names from the header line
with open(file, 'r') as f:
    reader = csv.reader(f, delimiter=';')
    names = next(reader)                       # header line
    for row in reader:
        if not row:
            continue                           # skip stray blank lines
        # last field is the label; the rest are attributes
        labels.append(float(row[-1]))
        xList.append([float(num) for num in row[:-1]])
# --- Normalize attribute columns and labels --------------------------------
# Note: be careful about normalization. Some penalized regression packages
# include it and some don't, so it is done explicitly here.
nrows = len(xList)
ncols = len(xList[0])

# Per-column mean and population standard deviation.
xMeans = []
xSD = []
for col in zip(*xList):
    colMean = sum(col) / nrows
    xMeans.append(colMean)
    xSD.append(sqrt(sum((v - colMean) * (v - colMean) for v in col) / nrows))

# Center and scale every attribute column.
xNormalized = [[(v - m) / s for v, m, s in zip(row, xMeans, xSD)]
               for row in xList]

# Center and scale the labels the same way.
meanLabel = sum(labels) / nrows
sdLabel = sqrt(sum((y - meanLabel) * (y - meanLabel) for y in labels) / nrows)
labelNormalized = [(y - meanLabel) / sdLabel for y in labels]
# --- Fit LassoCV and visualize cross-validation error ----------------------
# Convert the lists to numpy arrays for sklearn.  The normalized versions
# are used; to experiment with unnormalized data, substitute `labels` and
# `xList` here.  (The original code assigned the unnormalized arrays first
# and immediately overwrote them — those dead stores are removed.)
Y = numpy.array(labelNormalized)
X = numpy.array(xNormalized)

# 10-fold cross-validation: LassoCV scans a path of alpha values and keeps
# the one that minimizes average out-of-fold MSE.
wineModel = LassoCV(cv=10).fit(X, Y)

# Plot each fold's MSE curve (dotted) and the across-fold average vs alpha.
plot.figure()
plot.plot(wineModel.alphas_, wineModel.mse_path_, ':')
plot.plot(wineModel.alphas_, wineModel.mse_path_.mean(axis=-1),
          label='Average MSE Across Folds', linewidth=2)
plot.axvline(wineModel.alpha_, linestyle='--',
             label='CV Estimate of Best alpha')   # CV-optimal alpha
plot.semilogx()
plot.legend()
ax = plot.gca()
ax.invert_xaxis()   # strongest penalty on the left after inversion
plot.xlabel('alpha')
plot.ylabel('Mean Square Error')
plot.axis('tight')
plot.show()

# Report the alpha that minimizes CV error and the corresponding MSE.
print("alpha Value that Minimizes CV Error ", wineModel.alpha_)
print("Minimum MSE ", min(wineModel.mse_path_.mean(axis=-1)))
在代码中提供了对归一化和非归一化影响的分析,一般推荐归一化处理。至少在此处我们得到了一个最优的alpha值,print("Best Coefficient Values ", wineModel.coef_)可以得到该alpha对应的特征系数,也可以根据glmnet方法得到该alpha对应的特征系数,用于实现预测分析。另外,它也提供wineModel.predict(X)方法实现预测。
这是采用交叉验证的方法,当然也可以用整个数据集的方法:不再对各个alpha对应的MSE求平均,那就相当于只有一条曲线(alpha-MSE)了,同样可以实现惩罚线性回归分析。具体代码见附录。
惩罚线性回归模型下的回归分类问题
直接对多分类问题进行分析,因为前面提过分类问题本质上可以过渡到回归问题。9属性-6分类。
from math import sqrt, fabs, exp
import matplotlib.pyplot as plot
from sklearn.linear_model import enet_path
from sklearn.metrics import roc_auc_score, roc_curve
import numpy
# --- Read the glass data set -----------------------------------------------
# Comma-separated, no header; column 0 is a sample ID and the last column is
# the glass type (the class label).
file = "glass.data"
xList = []
with open(file, 'r') as f:
    for line in f.readlines():
        # split on comma
        xList.append(line.strip().split(","))

names = ['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'Type']

# Separate attributes from labels; drop the leading ID column.
xNum = []
labels = []
for row in xList:
    labels.append(row.pop())        # label is the last field (kept as str)
    xNum.append([float(v) for v in row[1:]])   # skip the ID in column 0

# Number of rows and columns in the attribute matrix.
nrow = len(xNum)
# Fixed: the original used len(xNum[1]) — row index 1 — which fails on a
# one-row data set; the first row is the conventional choice.
ncol = len(xNum[0])
# --- Build one-versus-all label vectors ------------------------------------
# Each row gets a binary indicator vector: 1.0 at the position of its glass
# type in the sorted list of distinct types, 0.0 elsewhere.
labelSet = set(labels)
labelList = sorted(labelSet)
nlabels = len(labelList)
yOneVAll = []
for lab in labels:
    indicator = [0.0] * nlabels
    indicator[labelList.index(lab)] = 1.0
    yOneVAll.append(indicator)
# --- Normalize attributes and one-vs-all labels ----------------------------
# Per-column mean and population standard deviation of the attributes.
xMeans = []
xSD = []
for col in zip(*xNum):
    colMean = sum(col) / nrow
    xMeans.append(colMean)
    xSD.append(sqrt(sum((v - colMean) * (v - colMean) for v in col) / nrow))

# Center and scale every attribute column.
xNormalized = [[(v - m) / s for v, m, s in zip(row, xMeans, xSD)]
               for row in xNum]

# Same treatment for each one-vs-all label column.
yMeans = []
ySD = []
for col in zip(*yOneVAll):
    colMean = sum(col) / nrow
    yMeans.append(colMean)
    ySD.append(sqrt(sum((v - colMean) * (v - colMean) for v in col) / nrow))

yNormalized = [[(v - m) / s for v, m, s in zip(row, yMeans, ySD)]
               for row in yOneVAll]
# --- 10-fold cross-validation over the elastic-net (lasso) path ------------
nxval = 10        # number of cross-validation folds
nAlphas = 100     # number of points on the penalty (alpha) path
misClass = [0.0] * nAlphas   # misclassification counts summed over folds

for ixval in range(nxval):
    # Every nxval-th row goes to the test set; the rest train the models.
    idxTest = [a for a in range(nrow) if a % nxval == ixval]
    idxTrain = [a for a in range(nrow) if a % nxval != ixval]

    xTrain = numpy.array([xNormalized[r] for r in idxTrain])
    xTest = numpy.array([xNormalized[r] for r in idxTest])
    yTrain = [yNormalized[r] for r in idxTrain]
    yTest = [yNormalized[r] for r in idxTest]
    labelsTest = [labels[r] for r in idxTest]

    # Fit one lasso path (l1_ratio=1.0) per one-vs-all label column.
    # NOTE: the deprecated return_models argument was removed from sklearn;
    # enet_path now always returns (alphas, coefs, dual_gaps).
    models = []
    lenTrain = len(yTrain)
    lenTest = len(yTest)
    for iModel in range(nlabels):
        yTemp = numpy.array([yTrain[j][iModel] for j in range(lenTrain)])
        models.append(enet_path(xTrain, yTemp, l1_ratio=1.0,
                                fit_intercept=False, eps=0.5e-3,
                                n_alphas=nAlphas))

    # Walk the alpha path; at each step classify each test row by the
    # largest un-normalized one-vs-all prediction and count errors.
    for iStep in range(1, nAlphas):
        allPredictions = []
        for iModel in range(nlabels):
            _, coefs, _ = models[iModel]
            predTemp = list(numpy.dot(xTest, coefs[:, iStep]))
            # Undo the label normalization so scores are comparable
            # across the one-vs-all models.
            predUnNorm = [(p * ySD[iModel] + yMeans[iModel]) for p in predTemp]
            allPredictions.append(predUnNorm)
        for i in range(lenTest):
            listOfPredictions = [allPredictions[j][i] for j in range(nlabels)]
            idxMax = listOfPredictions.index(max(listOfPredictions))
            if labelList[idxMax] != labelsTest[i]:
                misClass[iStep] += 1.0

# Each row appears in exactly one test fold, so dividing by nrow gives the
# overall misclassification rate at each step along the path.
misClassPlot = [misClass[i] / nrow for i in range(1, nAlphas)]
plot.plot(misClassPlot)
plot.xlabel("Penalty Parameter Steps")
plot.ylabel("Misclassification Error Rate")
plot.show()
附录:
__author__ = 'mike-bowles'
import numpy
from sklearn import datasets, linear_model
from sklearn.linear_model import LassoCV
from math import sqrt
import matplotlib.pyplot as plot
import csv
# --- Read the wine-quality data set ---------------------------------------
# The file is semicolon-delimited; the first line holds the column names and
# the last field of every data row is the quality score (the label).
# Uses the csv module (already imported above) instead of hand-rolled
# split(";"), which also strips the quote characters that otherwise remain
# embedded in the header names.
file = 'winequality-red.csv'
xList = []    # attribute rows, each a list of floats
labels = []   # quality score per row
names = []    # column names from the header line
with open(file, 'r') as f:
    reader = csv.reader(f, delimiter=';')
    names = next(reader)                       # header line
    for row in reader:
        if not row:
            continue                           # skip stray blank lines
        # last field is the label; the rest are attributes
        labels.append(float(row[-1]))
        xList.append([float(num) for num in row[:-1]])
# --- Normalize attribute columns and labels --------------------------------
# Note: be careful about normalization. Some penalized regression packages
# include it and some don't, so it is done explicitly here.
nrows = len(xList)
ncols = len(xList[0])

# Per-column mean and population standard deviation of the attributes.
xMeans = []
xSD = []
for i in range(ncols):
    column = [row[i] for row in xList]
    columnMean = sum(column) / nrows
    xMeans.append(columnMean)
    variance = sum((v - columnMean) * (v - columnMean) for v in column) / nrows
    xSD.append(sqrt(variance))

# Center and scale every attribute column.
xNormalized = [[(row[j] - xMeans[j]) / xSD[j] for j in range(ncols)]
               for row in xList]

# Center and scale the labels the same way.
meanLabel = sum(labels) / nrows
sdLabel = sqrt(sum((y - meanLabel) * (y - meanLabel) for y in labels) / nrows)
labelNormalized = [(y - meanLabel) / sdLabel for y in labels]
# --- Compute and plot the full lasso coefficient path ----------------------
# Convert the lists to numpy arrays for sklearn.  The normalized versions
# are used; substitute `labels` / `xList` for the unnormalized variants.
# (The original code assigned the unnormalized arrays first and immediately
# overwrote them — those dead stores are removed.)
Y = numpy.array(labelNormalized)
X = numpy.array(xNormalized)

# lasso_path returns (alphas, coefs, dual_gaps); the deprecated
# return_models flag was removed from sklearn and would raise TypeError.
alphas, coefs, _ = linear_model.lasso_path(X, Y)

# One curve per attribute: coefficient value versus alpha.
plot.plot(alphas, coefs.T)
plot.xlabel('alpha')
plot.ylabel('Coefficients')
plot.axis('tight')
plot.semilogx()
ax = plot.gca()
ax.invert_xaxis()   # strongest penalty on the left after inversion
plot.show()
# --- Order attributes by how early they enter the model --------------------
nattr, nalpha = coefs.shape

# Walk the path from strongest penalty to weakest; record each attribute the
# first time its coefficient becomes non-zero.  A companion set gives O(1)
# membership tests (the original scanned the list, O(n^2) overall) while the
# list preserves first-appearance order.
seen = set()
nzList = []
for iAlpha in range(1, nalpha):
    for index in range(nattr):
        if coefs[index, iAlpha] != 0.0 and index not in seen:
            seen.add(index)
            nzList.append(index)

nameList = [names[idx] for idx in nzList]
print("Attributes Ordered by How Early They Enter the Model", nameList)
# --- Coefficients at the best alpha ----------------------------------------
# alpha value chosen by 10-fold CV on normalized X and normalized Y
# (see the LassoCV run earlier in this post).
alphaStar = 0.013561387700964642

# alphas decreases along the path; take the last step whose alpha is still
# above alphaStar.  Fixed: iterate over len(alphas) rather than a
# hard-coded 100, so the code survives a different n_alphas setting.
indexLTalphaStar = [index for index in range(len(alphas))
                    if alphas[index] > alphaStar]
indexStar = max(indexLTalphaStar)

# Here's the set of coefficients to deploy.
coefStar = list(coefs[:, indexStar])
print("Best Coefficient Values ", coefStar)

# Rank attributes by coefficient magnitude at the optimal alpha.  Sorting
# the indices directly (stable sort) avoids the duplicate-index bug the
# original had: list.index() on sorted magnitudes returned the same index
# twice whenever two coefficients tied in absolute value.
absCoef = [abs(a) for a in coefStar]
idxCoefSize = [idx for idx in sorted(range(len(absCoef)),
                                     key=lambda k: absCoef[k], reverse=True)
               if absCoef[idx] != 0.0]
namesList2 = [names[idx] for idx in idxCoefSize]
print("Attributes Ordered by Coef Size at Optimum alpha", namesList2)