import urllib.request
import numpy
from sklearn import datasets, linear_model
from math import sqrt
import matplotlib.pyplot as plot
# Location of the red-wine quality data: ";"-separated text whose first line
# holds the column names and whose last column is the quality label.
target_url = ("http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv")

# Cross-validation and stepping parameters.
nxval = 10        # number of cross-validation folds
nSteps = 350      # coefficient updates performed in every fold
stepSize = 0.004  # magnitude of a single coefficient update


def parse_wine_rows(lines):
    """Split ";"-separated lines into header names, attribute rows and labels.

    The first non-blank line is taken as the header. In every later line the
    last field is the label and the fields before it are the attributes; all
    of them are converted to ``float``. Blank lines are skipped.

    Args:
        lines: Iterable of lines, each either ``bytes`` (UTF-8) or ``str``.

    Returns:
        A tuple ``(names, rows, labels)``: the header names with surrounding
        double quotes removed, one list of floats per data line, and one
        float label per data line.

    Raises:
        ValueError: If a data field cannot be converted to ``float``.
    """
    names = None
    rows = []
    labels = []
    for raw in lines:
        # Decode once at the boundary so all further handling is plain str.
        text = raw.decode("utf-8") if isinstance(raw, bytes) else raw
        text = text.strip()
        if not text:
            continue
        fields = text.split(";")
        if names is None:
            names = [field.strip('"') for field in fields]
            continue
        *attributes, label = fields
        rows.append([float(value) for value in attributes])
        labels.append(float(label))
    return (names or []), rows, labels


def mean_and_sd(values):
    """Return the mean and the population standard deviation of *values*.

    The standard deviation divides by ``len(values)`` (not ``len - 1``).

    Raises:
        ValueError: If *values* is empty.
    """
    count = len(values)
    if count == 0:
        raise ValueError("cannot compute statistics of an empty sequence")
    mean = sum(values) / count
    sum_sq = sum([(value - mean) * (value - mean) for value in values])
    return mean, sqrt(sum_sq / count)


def standardize(values, mean, sd):
    """Return ``(value - mean) / sd`` for every value.

    A zero *sd* (constant data) is replaced by 1.0 so the result is all
    zeros instead of a ``ZeroDivisionError``.
    """
    scale = sd if sd else 1.0
    return [(value - mean) / scale for value in values]


def normalize_columns(rows):
    """Standardize every column of *rows* to zero mean and unit deviation.

    Args:
        rows: List of equal-length lists of floats.

    Returns:
        A tuple ``(normalized_rows, means, sds)`` where ``means`` and ``sds``
        hold the per-column mean and population standard deviation.
        Constant columns (deviation 0) are mapped to 0.0.
    """
    stats = [mean_and_sd(column) for column in zip(*rows)]
    means = [mean for mean, _ in stats]
    sds = [sd for _, sd in stats]
    scales = [sd if sd else 1.0 for sd in sds]
    normalized = [
        [(value - mean) / scale
         for value, mean, scale in zip(row, means, scales)]
        for row in rows
    ]
    return normalized, means, sds


def forward_stagewise_fold(x_train, label_train, x_test, label_test,
                           n_steps, step_size):
    """Fit coefficients step by step on one fold and record test errors.

    Starting from all-zero coefficients, every step computes the training
    residuals, finds the attribute whose average product with the residuals
    has the largest magnitude, and moves that attribute's coefficient by
    ``step_size`` in the direction of the sign of that value. After each
    step the prediction error on every test row is recorded.

    Args:
        x_train: Training attribute rows (list of lists of floats).
        label_train: Training labels, one per training row.
        x_test: Held-out attribute rows.
        label_test: Held-out labels, one per test row.
        n_steps: Number of coefficient updates to perform.
        step_size: Magnitude of each coefficient update.

    Returns:
        A tuple ``(beta_path, step_errors)``. ``beta_path`` has
        ``n_steps + 1`` coefficient vectors (the initial zeros first);
        ``step_errors[s]`` lists ``label - prediction`` for every test row
        after step ``s``.

    Raises:
        ValueError: If there are no training rows or no attributes.
    """
    if not x_train or not x_train[0]:
        raise ValueError("need at least one training row and one attribute")
    n_train = len(x_train)
    n_cols = len(x_train[0])

    beta = [0.0] * n_cols
    beta_path = [list(beta)]
    step_errors = []
    for _ in range(n_steps):
        residuals = [
            label - sum([x * b for x, b in zip(row, beta)])
            for row, label in zip(x_train, label_train)
        ]
        # Average product of each attribute with the residuals; the
        # attributes are expected to be standardized by the caller.
        corr = [
            sum([row[j] * res for row, res in zip(x_train, residuals)]) / n_train
            for j in range(n_cols)
        ]
        # max() returns the first maximal item, so ties go to the lowest index.
        i_star = max(range(n_cols), key=lambda j: abs(corr[j]))
        corr_star = corr[i_star]
        # When no attribute is correlated with the residuals there is no
        # direction to move in; leave beta as is rather than dividing by zero.
        if corr_star:
            beta[i_star] += step_size * corr_star / abs(corr_star)
        beta_path.append(list(beta))

        step_errors.append([
            label - sum([x * b for x, b in zip(row, beta)])
            for row, label in zip(x_test, label_test)
        ])
    return beta_path, step_errors


def main():
    """Download the data, run the cross-validation and report after each fold."""
    # The response is closed as soon as all lines have been parsed.
    with urllib.request.urlopen(target_url) as data:
        names, x_list, labels = parse_wine_rows(data)
    if not x_list:
        raise ValueError("no data rows were read from " + target_url)

    # Standardize the attributes and, in the same way, the labels.
    x_normalized, x_means, x_sd = normalize_columns(x_list)
    mean_label, sd_label = mean_and_sd(labels)
    label_normalized = standardize(labels, mean_label, sd_label)

    nrows = len(x_normalized)
    # errors[s] collects the test errors after step s from every fold
    # processed so far.
    errors = [[] for _ in range(nSteps)]

    for ixval in range(nxval):
        # Row r is held out in fold r % nxval. An earlier version compared
        # against ixval * nxval, which did not yield nxval distinct folds.
        idx_test = [r for r in range(nrows) if r % nxval == ixval]
        idx_train = [r for r in range(nrows) if r % nxval != ixval]

        _, step_errors = forward_stagewise_fold(
            [x_normalized[r] for r in idx_train],
            [label_normalized[r] for r in idx_train],
            [x_normalized[r] for r in idx_test],
            [label_normalized[r] for r in idx_test],
            nSteps,
            stepSize,
        )
        for pooled, fold_errors in zip(errors, step_errors):
            pooled.extend(fold_errors)

        # Reported once per fold. Because errors is never reset, each report
        # covers all folds processed so far; the last one is the full
        # nxval-fold estimate.
        cv_curve = [
            sum([err * err for err in err_vect]) / len(err_vect)
            for err_vect in errors
        ]
        mine_mse = min(cv_curve)
        min_pt = cv_curve.index(mine_mse)
        print("Minimum Mean Square Error", mine_mse)
        print("Index of Minimum Mean Square Error", min_pt)

        plot.plot(range(len(cv_curve)), cv_curve)
        plot.xlabel("Steps Taken")
        plot.ylabel(("Mean Square Error"))
        plot.show()


if __name__ == "__main__":
    main()
# Output (one pair of lines per cross-validation fold):
# Minimum Mean Square Error 0.5873018933136459
# Index of Minimum Mean Square Error 311
# Minimum Mean Square Error 0.5534955247726759
# Index of Minimum Mean Square Error 289
# Minimum Mean Square Error 0.5957385843236068
# Index of Minimum Mean Square Error 244
# Minimum Mean Square Error 0.6163846701751715
# Index of Minimum Mean Square Error 265
# Minimum Mean Square Error 0.6205467405536572
# Index of Minimum Mean Square Error 289
# Minimum Mean Square Error 0.6273690438035697
# Index of Minimum Mean Square Error 312
# Minimum Mean Square Error 0.6214330728517901
# Index of Minimum Mean Square Error 285
# Minimum Mean Square Error 0.6180113626794431
# Index of Minimum Mean Square Error 285
# Minimum Mean Square Error 0.6295047735731523
# Index of Minimum Mean Square Error 280
# Minimum Mean Square Error 0.6494495844086484
# Index of Minimum Mean Square Error 285