python机器学习核心算法3-3

最新推荐文章于 2020-11-01 11:52:00 发布

llx1026

最新推荐文章于 2020-11-01 11:52:00 发布

阅读量810

点赞数 1

分类专栏： python机器学习-预测分析核心算法
文章标签： python 机器学习

本文链接：https://blog.csdn.net/llx1026/article/details/77841608

版权

python机器学习-预测分析核心算法专栏收录该内容

10 篇文章 1 订阅

订阅专栏

import numpy
from sklearn import datasets, linear_model
from math import sqrt
import matplotlib.pyplot as plt
import urllib.request

def xattrSelect(x, idxSet):
    """Project every row of ``x`` onto the columns listed in ``idxSet``.

    Args:
        x: Iterable of indexable rows.
        idxSet: Iterable of column indices; its iteration order determines
            the column order of each output row.

    Returns:
        A new list holding one list per input row, containing only the
        selected columns.
    """
    return [[record[col] for col in idxSet] for record in x]

# Red-wine quality data: ';'-separated text whose first line holds the
# column names and whose last column is the quality score.
target_url = ("http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv")

xList = []
labels = []
names = []
firstLine = True
# The response yields bytes, so the separator is bytes as well; building it
# once avoids re-encoding ";" for every line.
separator = b";"
# The context manager closes the HTTP response once all lines are consumed.
with urllib.request.urlopen(target_url) as data:
    for line in data:
        line = line.strip()
        if not line:
            # Skip blank lines (e.g. a trailing newline) instead of failing
            # in float() below.
            continue
        row = line.split(separator)
        if firstLine:
            # Header row: keep the attribute names (raw bytes, exactly as
            # they appear in the file).
            names = row
            firstLine = False
        else:
            # The last column is the target; everything before it is a
            # numeric attribute.
            labels.append(float(row.pop()))
            xList.append([float(num) for num in row])


# Split attributes and labels into a test set (every sample whose index is
# divisible by 3) and a training set (all remaining samples).
indices = range(len(xList))  # 0 .. len(xList) - 1, i.e. the valid xList indices
xListTest, xListTrain = [], []
labelsTest, labelsTrain = [], []
for i in indices:
    if i % 3 == 0:
        xListTest.append(xList[i])
        labelsTest.append(labels[i])
    else:
        xListTrain.append(xList[i])
        labelsTrain.append(labels[i])

# Forward stepwise selection: starting from an empty attribute set, each pass
# tries adding every attribute not yet selected, fits a linear regression on
# the training rows and keeps the attribute whose addition gives the lowest
# RMS error on the test rows.
attributeList = []                # selected column indices, in selection order
index = range(len(xList[0]))      # one entry per attribute column
indexSet = set(index)
indexSeq = []                     # not used by this block; retained as in the original script
oosError = []                     # oosError[k]: best test RMSE using k + 1 attributes

# The label vectors do not depend on the chosen columns, so convert them once
# instead of once per candidate model.
yTrain = numpy.array(labelsTrain)
yTest = numpy.array(labelsTest)

for _ in index:
    # Candidates are the columns not selected yet; sorting makes the trial
    # order (and therefore tie-breaking in argmin) explicit.
    attTry = sorted(indexSet - set(attributeList))
    errorList = []
    for iTry in attTry:
        # Already-selected columns plus the single candidate under trial.
        attTemp = attributeList + [iTry]
        xTrain = numpy.array(xattrSelect(xListTrain, attTemp))
        xTest = numpy.array(xattrSelect(xListTest, attTemp))
        wineQModel = linear_model.LinearRegression()
        wineQModel.fit(xTrain, yTrain)
        # Root-mean-square error of the predictions on the held-out rows.
        rmsError = numpy.linalg.norm((yTest - wineQModel.predict(xTest)), 2) / sqrt(len(yTest))
        errorList.append(rmsError)
    # Keep the candidate with the smallest held-out error for this set size.
    iBest = numpy.argmin(errorList)
    attributeList.append(attTry[iBest])
    oosError.append(errorList[iBest])

# Report the error curve and the order in which attributes were selected.
print("Out of sample error versus attribute set size")
print(oosError)
print("\n" + "Best attribute indices")
print(attributeList)
namesList = [names[i] for i in attributeList]
print("\n" + "Best attribute names")
print(namesList)

# oosError[k] is the error of the model using k + 1 attributes, so the x axis
# must start at 1 to agree with its "Number of Attributes" label.
x = range(1, len(oosError) + 1)
plt.plot(x, oosError, 'k')
plt.xlabel('Number of Attributes')
plt.ylabel('Error (RMS)')
plt.show()

# Refit the model on the attribute subset with the lowest held-out error and
# plot its residuals.
indexBest = oosError.index(min(oosError))
# oosError[k] was measured with the first k + 1 selected attributes, so the
# best subset is the prefix up to and including indexBest. Starting the slice
# at 1 would silently drop the first (strongest) attribute.
attributesBest = attributeList[:indexBest + 1]
xTrainTemp = xattrSelect(xListTrain, attributesBest)
xTestTemp = xattrSelect(xListTest, attributesBest)
xTrain = numpy.array(xTrainTemp)
xTest = numpy.array(xTestTemp)
# Build the label vectors here rather than relying on values left over from
# an earlier loop.
yTrain = numpy.array(labelsTrain)
yTest = numpy.array(labelsTest)
wineQModel = linear_model.LinearRegression()
wineQModel.fit(xTrain, yTrain)

# Predict once and reuse the result for both plots.
predictions = wineQModel.predict(xTest)
errorVector = yTest - predictions

# Histogram of the prediction errors on the held-out rows.
plt.hist(errorVector)
plt.xlabel("Bin Boundaries")
plt.ylabel("Counts")
plt.show()

# Predicted versus actual scores; low alpha lets overlapping points show density.
plt.scatter(predictions, yTest, s=100, alpha=0.10)
plt.xlabel('Predicted Taste Score')
plt.ylabel('Actual Taste Score')
plt.show()

最后运行结果：

Out of sample error versus attribute set size
[0.72342592551162777, 0.68609931528371959, 0.67343650334202776, 0.66770332138977961, 0.66225585685222721, 0.65900047541546247, 0.6572717206143075, 0.65709058062076975, 0.65699930964461395, 0.65758189400434719, 0.65739098690113407]

Best attribute indices
[10, 1, 9, 4, 6, 8, 5, 3, 2, 7, 0]

Best attribute names
[b'"alcohol"', b'"volatile acidity"', b'"sulphates"', b'"chlorides"', b'"total sulfur dioxide"', b'"pH"', b'"free sulfur dioxide"', b'"residual sugar"', b'"citric acid"', b'"density"', b'"fixed acidity"']

效果图：

llx1026

关注

1
点赞
踩
3

收藏

觉得还不错? 一键收藏
3
评论
python机器学习核心算法3-3

import numpy
from sklearn import datasets, linear_model
from math import sqrt
import matplotlib.pyplot as plt
import urllib.request
def xattrSelect(x, idxSet):
    xOut = []
    for row in x: …
复制链接

扫一扫

专栏目录