python lasso做交叉验证_[学习笔记][Python机器学习:预测分析核心算法][多变量回归:使用交叉验证来估计套索模型的样本外错误]...

from math import sqrt

import matplotlib.pyplot as plot
import numpy
from sklearn import datasets, linear_model
from sklearn.linear_model import LassoCV

# Read the data set from a local, semicolon-separated CSV file.
# The first line holds the column names; every following line is one sample
# whose last field is the label and whose other fields are the attributes.
#
# Original remote source, kept for reference:
# target_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
# data = urllib2.urlopen(target_url)

# A context manager guarantees the file is closed even if reading fails.
with open('winequality-red.csv', 'r') as target_url_file:
    data = target_url_file.readlines()

xList = []   # attribute rows, each a list of floats
labels = []  # label (last column) of each row, as float
names = []   # column names taken from the header line
firstLine = True
for line in data:
    stripped = line.strip()
    if not stripped:
        # Skip blank lines (e.g. a trailing newline at end of file);
        # float('') would otherwise raise ValueError.
        continue
    if firstLine:
        names = stripped.split(";")
        firstLine = False
    else:
        # split on semi-colon
        row = stripped.split(";")
        # put labels in separate array
        labels.append(float(row[-1]))
        # remove label from row
        row.pop()
        # convert row to floats
        floatRow = [float(num) for num in row]
        xList.append(floatRow)

# Normalize columns in x and labels
# Note: be careful about normalization.
# Some penalized regression packages include it and some don't.

nrows = len(xList)
ncols = len(xList[0])

# Calculate the mean and the population standard deviation (divide by nrows,
# not nrows - 1) of every attribute column.
xMeans = []
xSD = []
for i in range(ncols):
    col = [row[i] for row in xList]
    mean = sum(col) / nrows
    xMeans.append(mean)
    colDiff = [value - mean for value in col]
    # Distinct loop names here: the original reused `i` inside this
    # comprehension, shadowing the column index of the enclosing loop.
    sumSq = sum([diff * diff for diff in colDiff])
    stdDev = sqrt(sumSq / nrows)
    xSD.append(stdDev)

# Use the calculated means and standard deviations to normalize xList.
# A constant column has a standard deviation of 0 and raises
# ZeroDivisionError here, exactly as in the original code.
xNormalized = []
for i in range(nrows):
    rowNormalized = [(xList[i][j] - xMeans[j]) / xSD[j] for j in range(ncols)]
    xNormalized.append(rowNormalized)

# Normalize labels the same way (zero mean, unit population std deviation).
meanLabel = sum(labels) / nrows
sdLabel = sqrt(sum([(labels[i] - meanLabel) * (labels[i] - meanLabel)
                    for i in range(nrows)]) / nrows)

labelNormalized = [(labels[i] - meanLabel) / sdLabel for i in range(nrows)]

# Convert list of lists to numpy arrays for input to the sklearn packages.
#
# The model is fitted on the normalized labels and normalized attributes.
# The original listing first assigned the unnormalized arrays and then
# immediately overwrote them; those dead assignments are dropped. To fit on
# the raw values instead, use numpy.array(labels) and numpy.array(xList).

# Normalized labels
Y = numpy.array(labelNormalized)

# Normalized X's
X = numpy.array(xNormalized)

# Call LassoCV from sklearn.linear_model
# 10-fold cross-validation
wineModel = LassoCV(cv=10).fit(X, Y)

# Display results

# Create exactly one figure. The original called plot.figure() and then
# plot.figure(figsize=(12,8)), which left an extra empty figure that
# plot.show() displayed as well.
plot.figure(figsize=(12, 8))
# Mean squared error as alpha varies, drawn as dotted curves
# (one curve per column of mse_path_).
plot.plot(wineModel.alphas_, wineModel.mse_path_, ':')
# Mean squared error averaged over the last axis of mse_path_ (across folds)
# as alpha varies.
plot.plot(wineModel.alphas_, wineModel.mse_path_.mean(axis=-1),
          label='Average MSE Across Folds', linewidth=2)
# Vertical line at the alpha value selected by cross-validation.
plot.axvline(wineModel.alpha_, linestyle='dotted',
             label='CV Estimate of Best alpha')
# Semi-log axes: the x axis uses a base-10 logarithmic scale, which makes
# exponential changes in alpha easy to read.
plot.semilogx()
# Add the legend for the labelled curves.
plot.legend()
# plot.gca() returns the current Axes (plot.gcf() would return the current
# Figure).
ax = plot.gca()
# Reverse the x axis.
ax.invert_xaxis()

plot.xlabel('alpha')
plot.ylabel('Mean Square Error')
plot.axis('tight')
plot.show()

# Print out the value of alpha that minimizes the CV error.
print("alpha Value that Minimizes CV Error", wineModel.alpha_)
print("Minimum MSE", min(wineModel.mse_path_.mean(axis=-1)))

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值