Python机器学习:代码清单4-2详解及修改

import urllib.request
import numpy
from sklearn import datasets, linear_model
from math import sqrt
import matplotlib.pyplot as plot

# Download the red-wine-quality CSV; fields are separated by semicolons and
# the first line holds the column names.
target_url = ("http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv")

# The first line is stored in `names`. For every other line the last field is
# converted to float and appended to `labels`; the remaining fields are
# converted to float and appended to `xList` as one row of attributes.
xList = []
labels = []
names = []
firstLine = True
# The context manager closes the HTTP response when parsing finishes, even if
# a malformed row raises part-way through.
with urllib.request.urlopen(target_url) as data:
    for line in data:
        stripped = line.strip()
        if not stripped:
            # Tolerate blank lines instead of failing on float(b"").
            continue
        # The response yields bytes, so split on a bytes separator.
        row = stripped.split(b";")
        if firstLine:
            names = row
            firstLine = False
        else:
            labels.append(float(row[-1]))
            row.pop()
            floatRow = [float(num) for num in row]
            xList.append(floatRow)
        

# Compute, for every attribute column of xList, its mean and its population
# standard deviation (the squared deviations are divided by nrows).
nrows = len(xList)
ncols = len(xList[0])

xMeans = []
xSD = []
for colIdx in range(ncols):
    column = [record[colIdx] for record in xList]
    colMean = sum(column) / nrows
    xMeans.append(colMean)
    deviations = [value - colMean for value in column]
    squaredTotal = sum([dev * dev for dev in deviations])
    xSD.append(sqrt(squaredTotal / nrows))

# Standardize every element of xList: subtract the mean of its column
# (xMeans) and divide by the standard deviation of its column (xSD).
xNormalized = [
    [(record[col] - xMeans[col]) / xSD[col] for col in range(ncols)]
    for record in xList
]

# Standardize the labels the same way: subtract their mean and divide by
# their population standard deviation.
meanLabel = sum(labels) / nrows
labelDeviations = [labels[idx] - meanLabel for idx in range(nrows)]
sdLabel = sqrt(sum(dev * dev for dev in labelDeviations) / nrows)
labelNormalized = [dev / sdLabel for dev in labelDeviations]

# Settings for the run: number of cross-validation folds, number of
# stagewise steps per fold, and the size of each coefficient update.
nxval = 10
nSteps = 350
stepSize = 0.004

# One independent, initially empty list of out-of-sample errors per step.
errors = [[] for _ in range(nSteps)]

# 10-fold cross-validation of forward stagewise regression.
#
# Fold ixval holds out every row whose index leaves remainder ixval when
# divided by nxval and trains on all the other rows. (Per the original
# author's note, the listing this was adapted from compared against
# ixval * nxval instead, which did not produce ten distinct folds.)
#
# NOTE: `errors` is created once before this loop and is only ever appended
# to, so the curve printed and plotted at the end of fold k pools the test
# errors of folds 0..k. Only the output of the last fold covers all folds.
for ixval in range(nxval):
    idxTest = [a for a in range(nrows) if a % nxval == ixval]
    idxTrain = [a for a in range(nrows) if a % nxval != ixval]

    xTrain = [xNormalized[r] for r in idxTrain]
    xTest = [xNormalized[r] for r in idxTest]
    labelTrain = [labelNormalized[r] for r in idxTrain]
    labelTest = [labelNormalized[r] for r in idxTest]

    nrowsTrain = len(idxTrain)
    nrowsTest = len(idxTest)

    # Coefficients start at zero; betaMat records them after every step.
    beta = [0.0] * ncols
    betaMat = []
    betaMat.append(list(beta))

    # Run nSteps stagewise updates for this fold.
    for iStep in range(nSteps):
        # Residual of every training row under the current coefficients.
        # Sized by the training set, not by the full data set.
        residuals = [
            labelTrain[j] - sum([xTrain[j][k] * beta[k] for k in range(ncols)])
            for j in range(nrowsTrain)
        ]

        # For each attribute: the average over the training rows of
        # (attribute value * residual), one entry per column.
        corr = [
            sum([xTrain[k][j] * residuals[k] for k in range(nrowsTrain)]) / nrowsTrain
            for j in range(ncols)
        ]

        # Pick the attribute with the largest absolute value in corr;
        # ties keep the lowest column index.
        iStar = 0
        corrStar = corr[0]
        for j in range(1, ncols):
            if abs(corrStar) < abs(corr[j]):
                iStar = j
                corrStar = corr[j]

        # Move the chosen coefficient by stepSize in the direction of the sign
        # of corrStar. If every entry of corr is exactly zero there is no
        # direction to move in, so skip the update rather than divide by zero.
        if corrStar != 0.0:
            beta[iStar] += stepSize * corrStar / abs(corrStar)
        betaMat.append(list(beta))

        # Record the out-of-sample error of every held-out row at this step.
        for j in range(nrowsTest):
            labelsHat = sum([xTest[j][k] * beta[k] for k in range(ncols)])
            errors[iStep].append(labelTest[j] - labelsHat)

    # Mean squared error per step over all errors collected so far.
    cvCurve = []
    for errVect in errors:
        mse = sum([x * x for x in errVect]) / len(errVect)
        cvCurve.append(mse)

    # Report the smallest MSE and the first step at which it occurs.
    mineMse = min(cvCurve)
    minPt = cvCurve.index(mineMse)
    print("Minimum Mean Square Error", mineMse)
    print("Index of Minimum Mean Square Error", minPt)

    xaxis = range(len(cvCurve))
    plot.plot(xaxis, cvCurve)

    plot.xlabel("Steps Taken")
    plot.ylabel(("Mean Square Error"))
    plot.show()

输出结果:

Minimum Mean Square Error 0.5873018933136459
Index of Minimum Mean Square Error 311
Minimum Mean Square Error 0.5534955247726759
Index of Minimum Mean Square Error 289
Minimum Mean Square Error 0.5957385843236068
Index of Minimum Mean Square Error 244
Minimum Mean Square Error 0.6163846701751715
Index of Minimum Mean Square Error 265
Minimum Mean Square Error 0.6205467405536572
Index of Minimum Mean Square Error 289
Minimum Mean Square Error 0.6273690438035697
Index of Minimum Mean Square Error 312
Minimum Mean Square Error 0.6214330728517901
Index of Minimum Mean Square Error 285
Minimum Mean Square Error 0.6180113626794431
Index of Minimum Mean Square Error 285
Minimum Mean Square Error 0.6295047735731523
Index of Minimum Mean Square Error 280
Minimum Mean Square Error 0.6494495844086484
Index of Minimum Mean Square Error 285


  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值