Python机器学习核心算法 3-3

import numpy
from sklearn import datasets, linear_model
from math import sqrt
import matplotlib.pyplot as plt
import urllib.request

def xattrSelect(x, idxSet):
    """Return a copy of *x* keeping only the columns listed in *idxSet*.

    x is an iterable of indexable rows; idxSet is an iterable of column
    indices. Each output row holds the selected values in the order
    idxSet yields them. The result is a new list of lists.
    """
    return [[row[i] for i in idxSet] for row in x]

# Semicolon-separated wine-quality data; the first line holds the column names.
target_url = ("http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv")

xList = []   # one list of float attribute values per data row
labels = []  # last column of each data row (quality), as float
names = []   # header fields, kept as raw bytes exactly as read from the stream

# The context manager closes the HTTP response even if parsing raises.
with urllib.request.urlopen(target_url) as data:
    firstLine = True
    for line in data:
        line = line.strip()
        if not line:
            # A blank line would otherwise fail in float(); skip it.
            continue
        row = line.split(b";")
        if firstLine:
            # The first non-blank line is the header with the attribute names.
            names = row
            firstLine = False
        else:
            # The last field is the label; the remaining fields are attributes.
            labels.append(float(row.pop()))
            floatRow = [float(num) for num in row]
            xList.append(floatRow)


# Split attributes and labels into training and test sets: every third row
# (index 0, 3, 6, ...) goes to the test set, all other rows to the training set.
indices = range(len(xList))  # valid row indices of xList, starting at 0
xListTest, xListTrain = [], []
labelsTest, labelsTrain = [], []
for i in indices:
    if i % 3 == 0:
        xListTest.append(xList[i])
        labelsTest.append(labels[i])
    else:
        xListTrain.append(xList[i])
        labelsTrain.append(labels[i])

# Forward stepwise selection: on every pass, try adding each attribute that
# has not been chosen yet, fit a linear model on the training rows, and keep
# the attribute whose model has the lowest RMS error on the test rows.
attributeList = []            # chosen column indices, in the order they were picked
index = range(len(xList[1]))  # one index per attribute column
indexSet = set(index)
indexSeq = []
oosError = []                 # best test-set RMS error after each pass

# The label arrays do not depend on the chosen columns, so build them once.
yTrain = numpy.array(labelsTrain)
yTest = numpy.array(labelsTest)

for i in index:
    # Candidates are all columns that have not been selected so far.
    attSet = set(attributeList)
    attTrySet = indexSet - attSet
    attTry = list(attTrySet)
    errorList = []
    for iTry in attTry:
        # Columns already chosen plus the single candidate under test.
        attTemp = attributeList + [iTry]
        xTrain = numpy.array(xattrSelect(xListTrain, attTemp))
        xTest = numpy.array(xattrSelect(xListTest, attTemp))
        wineQModel = linear_model.LinearRegression()
        wineQModel.fit(xTrain, yTrain)
        # RMS error = 2-norm of the residual vector / sqrt(number of test rows).
        residuals = yTest - wineQModel.predict(xTest)
        rmsError = numpy.linalg.norm(residuals, 2) / sqrt(len(yTest))
        errorList.append(rmsError)
    # Keep the candidate with the smallest error for this subset size.
    iBest = numpy.argmin(errorList)
    attributeList.append(attTry[iBest])
    oosError.append(errorList[iBest])

# Report the error curve and the order in which attributes were selected.
print("Out of sample error versus attribute set size")
print(oosError)
print("\n" + "Best attribute indices")
print(attributeList)
namesList = [names[i] for i in attributeList]
print("\n" + "Best attribute names")
print(namesList)

# oosError[k] belongs to the model built from k + 1 attributes, so the x
# values start at 1 to agree with the "Number of Attributes" axis label.
x = range(1, len(oosError) + 1)
plt.plot(x, oosError, 'k')
plt.xlabel('Number of Attributes')
plt.ylabel('Error (RMS)')
plt.show()

# Refit a model on the attribute subset with the lowest test-set error and
# plot its residual histogram and a predicted-vs-actual scatter.
indexBest = oosError.index(min(oosError))
# The best subset is the first indexBest + 1 selected attributes. The slice
# must start at 0: starting at 1 would drop the first-chosen attribute and
# give an empty subset when indexBest is 0.
attributesBest = attributeList[:indexBest + 1]
xTrainTemp = xattrSelect(xListTrain, attributesBest)
xTestTemp = xattrSelect(xListTest, attributesBest)
xTrain = numpy.array(xTrainTemp)
xTest = numpy.array(xTestTemp)
# Build the label arrays here rather than relying on variables left over
# from an earlier loop.
yTrain = numpy.array(labelsTrain)
yTest = numpy.array(labelsTest)
wineQModel = linear_model.LinearRegression()
wineQModel.fit(xTrain, yTrain)
# Predict once and reuse the result for both plots.
predictions = wineQModel.predict(xTest)
errorVector = yTest - predictions
plt.hist(errorVector)
plt.xlabel("Bin Boundaries")
plt.ylabel("Counts")
plt.show()

plt.scatter(predictions, yTest, s=100, alpha=0.10)
plt.xlabel('Predicted Taste Score')
plt.ylabel('Actual Taste Score')
plt.show()

最后运行结果:

Out of sample error versus attribute set size
[0.72342592551162777, 0.68609931528371959, 0.67343650334202776, 0.66770332138977961, 0.66225585685222721, 0.65900047541546247, 0.6572717206143075, 0.65709058062076975, 0.65699930964461395, 0.65758189400434719, 0.65739098690113407]

Best attribute indices
[10, 1, 9, 4, 6, 8, 5, 3, 2, 7, 0]

Best attribute names
[b'"alcohol"', b'"volatile acidity"', b'"sulphates"', b'"chlorides"', b'"total sulfur dioxide"', b'"pH"', b'"free sulfur dioxide"', b'"residual sugar"', b'"citric acid"', b'"density"', b'"fixed acidity"']
效果图:

  • 1
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 3
    评论
评论 3
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值