from math import sqrt

import matplotlib.pyplot as plot
import numpy
from sklearn import datasets, linear_model
from sklearn.linear_model import LassoCV

# Read the data set into a list of text lines.
# The file can alternatively be fetched from the UCI repository:
# target_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
# data = urllib2.urlopen(target_url)
# The context manager closes the file handle even if reading raises.
with open('winequality-red.csv', 'r') as target_url_file:
    data = target_url_file.readlines()

# Split the raw lines into column names, attribute rows and labels.
# The first non-blank line is the header; every later non-blank line is a
# semicolon-separated record whose last field is the label.
xList = []
labels = []
names = []
firstLine = True
for line in data:
    stripped = line.strip()
    if not stripped:
        # Skip blank lines (e.g. a trailing newline at the end of the file)
        # instead of failing on float('').
        continue
    if firstLine:
        names = stripped.split(";")
        firstLine = False
        continue
    # split on semi-colon
    row = stripped.split(";")
    # put the label (last field) in a separate list
    labels.append(float(row[-1]))
    # the remaining fields are the attributes, converted to floats
    xList.append([float(num) for num in row[:-1]])

# Normalize the columns of x and the labels.
# Note: be careful about normalization. Some penalized regression packages
# perform it internally and some do not.

nrows = len(xList)
ncols = len(xList[0])

# Per-column mean and standard deviation (dividing by nrows, i.e. the
# population form rather than the sample form).
xMeans = []
xSD = []
for colIdx in range(ncols):
    column = [xList[rowIdx][colIdx] for rowIdx in range(nrows)]
    colMean = sum(column) / nrows
    xMeans.append(colMean)
    squaredDiffs = [(value - colMean) * (value - colMean) for value in column]
    xSD.append(sqrt(sum(squaredDiffs) / nrows))

# Use the column means (xMeans) and standard deviations (xSD) to normalize
# every attribute value in xList.
xNormalized = [
    [(row[col] - xMeans[col]) / xSD[col] for col in range(ncols)]
    for row in xList
]

# Normalize the labels: subtract their mean and divide by their standard
# deviation (computed with a divisor of nrows).
meanLabel = sum(labels) / nrows
labelDeviations = [label - meanLabel for label in labels]
sdLabel = sqrt(sum([dev * dev for dev in labelDeviations]) / nrows)

labelNormalized = [dev / sdLabel for dev in labelDeviations]

# Convert the lists to numpy arrays for input to the sklearn packages.
# Only the normalized variants are built, because those are what the model
# is fit on. To fit on unnormalized values instead, use
# numpy.array(labels) for Y and numpy.array(xList) for X.

# Normalized labels
Y = numpy.array(labelNormalized)

# Normalized X's
X = numpy.array(xNormalized)

# Fit a LassoCV model from sklearn.linear_model,
# using 10-fold cross-validation.
wineModel = LassoCV(cv=10)
wineModel.fit(X, Y)

# Display results

# Create exactly one figure. A bare plot.figure() call before this one would
# open an additional, empty figure that plot.show() also displays.
plot.figure(figsize=(12, 8))
# MSE as alpha varies, drawn as dotted curves (one per column of mse_path_;
# per the LassoCV documentation these columns are the cross-validation folds)
plot.plot(wineModel.alphas_, wineModel.mse_path_, ':')
# MSE averaged over the last axis of mse_path_, as a function of alpha
plot.plot(wineModel.alphas_, wineModel.mse_path_.mean(axis=-1),
          label='Average MSE Across Folds', linewidth=2)
# Vertical line at the alpha value selected by cross-validation
plot.axvline(wineModel.alpha_, linestyle='dotted',
             label='CV Estimate of Best alpha')
# Use a base-10 logarithmic x axis, which makes it easier to see how the
# error behaves when alpha changes exponentially
plot.semilogx()
# Add the legend to the chart
plot.legend()
# plot.gca() returns the current axes (Get Current Axes); plot.gcf() would
# return the current figure (Get Current Figure)
ax = plot.gca()
# Reverse the direction of the x axis
ax.invert_xaxis()

plot.xlabel('alpha')
plot.ylabel('Mean Square Error')
plot.axis('tight')
plot.show()

# Print the value of alpha that minimizes the CV error, followed by the
# smallest value of the MSE averaged over the last axis of mse_path_.
bestAlpha = wineModel.alpha_
print("alpha Value that Minimizes CV Error", bestAlpha)
meanMsePerAlpha = wineModel.mse_path_.mean(axis=-1)
print("Minimum MSE", min(meanMsePerAlpha))