模型:多元线性回归,稳健回归,偏最小二乘回归,岭回归,lasso回归,弹性网
语言:R语言
参考书:应用预测建模 Applied Predictive Modeling (2013) by Max Kuhn and Kjell Johnson,林荟等译
本案例中的一些模型概念与数据分析思路不做详细解释,因为在之前的博文中已经写过了,是类似的,可以参考同章节的博文。
案例:
导入数据
#载入数据
library(AppliedPredictiveModeling)
data(ChemicalManufacturingProcess)
head(ChemicalManufacturingProcess)
> head(ChemicalManufacturingProcess)
Yield BiologicalMaterial01 BiologicalMaterial02 BiologicalMaterial03 BiologicalMaterial04
1 38.00 6.25 49.58 56.97 12.74
2 42.44 8.01 60.97 67.48 14.65
3 42.03 8.01 60.97 67.48 14.65
4 41.42 8.01 60.97 67.48 14.65
5 42.49 7.47 63.33 72.25 14.02
6 43.57 6.12 58.36 65.31 15.17
BiologicalMaterial05 BiologicalMaterial06 BiologicalMaterial07 BiologicalMaterial08
1 19.51 43.73 100 16.66
2 19.36 53.14 100 19.04
3 19.36 53.14 100 19.04
4 19.36 53.14 100 19.04
5 17.91 54.66 100 18.22
6 21.79 51.23 100 18.30
BiologicalMaterial09 BiologicalMaterial10 BiologicalMaterial11 BiologicalMaterial12
1 11.44 3.46 138.09 18.83
2 12.55 3.46 153.67 21.05
3 12.55 3.46 153.67 21.05
4 12.55 3.46 153.67 21.05
5 12.80 3.05 147.61 21.05
6 12.13 3.78 151.88 20.76
ManufacturingProcess01 ManufacturingProcess02 ManufacturingProcess03 ManufacturingProcess04
1 NA NA NA NA
2 0.0 0 NA 917
3 0.0 0 NA 912
4 0.0 0 NA 911
5 10.7 0 NA 918
6 12.0 0 NA 924
ManufacturingProcess05 ManufacturingProcess06 ManufacturingProcess07 ManufacturingProcess08
1 NA NA NA NA
2 1032.2 210.0 177 178
3 1003.6 207.1 178 178
4 1014.6 213.3 177 177
5 1027.5 205.7 178 178
6 1016.8 208.9 178 178
ManufacturingProcess09 ManufacturingProcess10 ManufacturingProcess11 ManufacturingProcess12
1 43.00 NA NA NA
2 46.57 NA NA 0
3 45.07 NA NA 0
4 44.92 NA NA 0
5 44.96 NA NA 0
6 45.32 NA NA 0
ManufacturingProcess13 ManufacturingProcess14 ManufacturingProcess15 ManufacturingProcess16
1 35.5 4898 6108 4682
2 34.0 4869 6095 4617
3 34.8 4878 6087 4617
4 34.8 4897 6102 4635
5 34.6 4992 6233 4733
6 34.0 4985 6222 4786
ManufacturingProcess17 ManufacturingProcess18 ManufacturingProcess19 ManufacturingProcess20
1 35.5 4865 6049 4665
2 34.0 4867 6097 4621
3 34.8 4877 6078 4621
4 34.8 4872 6073 4611
5 33.9 4886 6102 4659
6 33.4 4862 6115 4696
ManufacturingProcess21 ManufacturingProcess22 ManufacturingProcess23 ManufacturingProcess24
1 0.0 NA NA NA
2 0.0 3 0 3
3 0.0 4 1 4
4 0.0 5 2 5
5 -0.7 8 4 18
6 -0.6 9 1 1
ManufacturingProcess25 ManufacturingProcess26 ManufacturingProcess27 ManufacturingProcess28
1 4873 6074 4685 10.7
2 4869 6107 4630 11.2
3 4897 6116 4637 11.1
4 4892 6111 4630 11.1
5 4930 6151 4684 11.3
6 4871 6128 4687 11.4
ManufacturingProcess29 ManufacturingProcess30 ManufacturingProcess31 ManufacturingProcess32
1 21.0 9.9 69.1 156
2 21.4 9.9 68.7 169
3 21.3 9.4 69.3 173
4 21.3 9.4 69.3 171
5 21.6 9.0 69.4 171
6 21.7 10.1 68.2 173
ManufacturingProcess33 ManufacturingProcess34 ManufacturingProcess35 ManufacturingProcess36
1 66 2.4 486 0.019
2 66 2.6 508 0.019
3 66 2.6 509 0.018
4 68 2.5 496 0.018
5 70 2.5 468 0.017
6 70 2.5 490 0.018
ManufacturingProcess37 ManufacturingProcess38 ManufacturingProcess39 ManufacturingProcess40
1 0.5 3 7.2 NA
2 2.0 2 7.2 0.1
3 0.7 2 7.2 0.0
4 1.2 2 7.2 0.0
5 0.2 2 7.3 0.0
6 0.4 2 7.2 0.0
ManufacturingProcess41 ManufacturingProcess42 ManufacturingProcess43 ManufacturingProcess44
1 NA 11.6 3.0 1.8
2 0.15 11.1 0.9 1.9
3 0.00 12.0 1.0 1.8
4 0.00 10.6 1.1 1.8
5 0.00 11.0 1.1 1.7
6 0.00 11.5 2.2 1.8
ManufacturingProcess45
1 2.4
2 2.2
3 2.3
4 2.1
5 2.1
6 2.0
#将预测变量与响应变量分开
Yield<-ChemicalManufacturingProcess[,1]
Predictors<-ChemicalManufacturingProcess[,-1]
了解缺失值情况。
#数据预处理
#有缺失值的变量所在的位置
NAcol <- which(colSums(is.na(Predictors))>0)
#有缺失值的变量名
na.va <- names(Predictors)[NAcol]
#根据缺失值个数将有缺失值的变量进行降序排列,显示变量名与缺失值个数
sort(colSums(sapply(Predictors[NAcol],is.na)),decreasing = TRUE)
> #根据缺失值个数将有缺失值的变量进行降序排列,显示变量名与缺失值个数
> sort(colSums(sapply(Predictors[NAcol],is.na)),decreasing = TRUE)
ManufacturingProcess03 ManufacturingProcess11 ManufacturingProcess10 ManufacturingProcess25
15 10 9 5
ManufacturingProcess26 ManufacturingProcess27 ManufacturingProcess28 ManufacturingProcess29
5 5 5 5
ManufacturingProcess30 ManufacturingProcess31 ManufacturingProcess33 ManufacturingProcess34
5 5 5 5
ManufacturingProcess35 ManufacturingProcess36 ManufacturingProcess02 ManufacturingProcess06
5 5 3 2
ManufacturingProcess01 ManufacturingProcess04 ManufacturingProcess05 ManufacturingProcess07
1 1 1 1
ManufacturingProcess08 ManufacturingProcess12 ManufacturingProcess14 ManufacturingProcess22
1 1 1 1
ManufacturingProcess23 ManufacturingProcess24 ManufacturingProcess40 ManufacturingProcess41
1 1 1 1
用装袋树插补进行缺失值插补。
#缺失值插补
#可用caret包中的preProcess()函数,其中有三种缺失值插补方法:K近邻、装袋树、中位数。
#装袋树插补
trans<-preProcess(Predictors, method=c("bagImpute"), na.remove=FALSE )
Predictors.trans<-predict(trans,Predictors)
summary(Predictors.trans)
head(Predictors.trans[,1:5])
head(Predictors[,1:5])
> head(Predictors.trans[,1:5])
BiologicalMaterial01 BiologicalMaterial02 BiologicalMaterial03 BiologicalMaterial04
1 6.25 49.58 56.97 12.74
2 8.01 60.97 67.48 14.65
3 8.01 60.97 67.48 14.65
4 8.01 60.97 67.48 14.65
5 7.47 63.33 72.25 14.02
6 6.12 58.36 65.31 15.17
BiologicalMaterial05
1 19.51
2 19.36
3 19.36
4 19.36
5 17.91
6 21.79
> head(Predictors.trans[,1:5])
BiologicalMaterial01 BiologicalMaterial02 BiologicalMaterial03 BiologicalMaterial04
1 6.25 49.58 56.97 12.74
2 8.01 60.97 67.48 14.65
3 8.01 60.97 67.48 14.65
4 8.01 60.97 67.48 14.65
5 7.47 63.33 72.25 14.02
6 6.12 58.36 65.31 15.17
BiologicalMaterial05
1 19.51
2 19.36
3 19.36
4 19.36
5 17.91
6 21.79
偏度变换。
#计算偏度
library(e1071)
summary(apply(Predictors.trans,2,skewness) )
#box-cox变换
boxcox = function(x){
trans = BoxCoxTrans(x)
x_bc = predict( trans, x )
}
Predictors.trans2<-apply( Predictors.trans, 2, boxcox )
Predictors.trans2<-as.data.frame(Predictors.trans2)
#将box-cox的结果赋值到一个列表中,列表中包含57个列表(57个向量的box-cox变换)
trans.boxcox<-list()
for (i in 1:length(Predictors.trans))
trans.boxcox[[i]]<- BoxCoxTrans(Predictors.trans[,i])
#偏度减小
summary(apply(Predictors.trans,2,skewness) )
summary(apply(Predictors.trans2,2,skewness) )
head(Predictors.trans[,1:5])
head(Predictors.trans2[,1:5])
> #偏度减小
> summary(apply(Predictors.trans,2,skewness) )
Min. 1st Qu. Median Mean 3rd Qu. Max.
-12.85958 -1.68180 0.07581 -1.61722 0.37836 9.05487
> summary(apply(Predictors.trans2,2,skewness) )
Min. 1st Qu. Median Mean 3rd Qu. Max.
-12.85958 -1.68180 -0.01018 -1.77944 0.19132 9.05487
>
> head(Predictors.trans[,1:5])
BiologicalMaterial01 BiologicalMaterial02 BiologicalMaterial03 BiologicalMaterial04
1 6.25 49.58 56.97 12.74
2 8.01 60.97 67.48 14.65
3 8.01 60.97 67.48 14.65
4 8.01 60.97 67.48 14.65
5 7.47 63.33 72.25 14.02
6 6.12 58.36 65.31 15.17
BiologicalMaterial05
1 19.51
2 19.36
3 19.36
4 19.36
5 17.91
6 21.79
> head(Predictors.trans2[,1:5])
BiologicalMaterial01 BiologicalMaterial02 BiologicalMaterial03 BiologicalMaterial04
1 2.442874 1.335635 56.97 0.6143436
2 2.889218 1.348159 67.48 0.6164780
3 2.889218 1.348159 67.48 0.6164780
4 2.889218 1.348159 67.48 0.6164780
5 2.760280 1.350269 72.25 0.6158571
6 2.406565 1.345659 65.31 0.6169406
BiologicalMaterial05
1 2.970927
2 2.963209
3 2.963209
4 2.963209
5 2.885359
6 3.081451
#有些变量没有进行box-cox变换
#有两个原因:1)原本偏度很小,不需要进行
skewness(Predictors.trans2[,c("BiologicalMaterial03")]) # 0.02851075
#1)虽然偏度大,但是数据的唯一值小于3个(默认的numUnique = 3)
skewness(Predictors.trans2[,c("BiologicalMaterial07")])# 7.398664
hist(Predictors.trans$BiologicalMaterial07)
划分训练集与测试集。
#对于小样本数据,其实不应该划分单一的训练集与测试集,而是应该采用重抽样方法。但是考虑到习题学习方法,将其分开。
#分层随机抽样
library(caret)
set.seed(100)
trainingRows<-createDataPartition(Yield,p=.8,list=FALSE) #返回索引矩阵
train.Predictors<-Predictors.trans2[trainingRows,]
test.Predictors<-Predictors.trans2[-trainingRows,]
train.Yield<-Yield[trainingRows]
test.Yield<-Yield[-trainingRows]
#建模
#用重抽样方法对模型预测新样本的能力进行评价
#本次的重抽样方法重复10折交叉验证,建模目的为在几个模型之间进行选择(包括同一个算法选择最优参数与在不同模型之间进行选择)
#设定随机数种子,这样重抽样数据集可以重复
set.seed(100)
indx <- createMultiFolds(train.Yield, k = 10, times = 5)
ctrl <- trainControl(method="repeatedcv",repeats=5,index = indx)
PLS偏最小二乘回归(调优参数:成分数)
#PLS
set.seed(100)
pls.model <- train(x = train.Predictors, y = train.Yield,
method = "kernelpls",#Dayal和MacGregor的第一种核函数算法kernelpls
# tuneGrid = expand.grid(ncomp = 1:10),#通过grid设定成分数,这里不用
tuneLength=40, #40个成分数
trControl = ctrl,
preProc = c("center", "scale"))
pls.model
pls.model$bestTune
testResults <- data.frame(obs = test.Yield,
pls = predict(pls.model,test.Predictors))#存储测试集预测数据
PLS最优成分数1(RMSE最小)
> pls.model
Partial Least Squares
144 samples
57 predictor
Pre-processing: centered (57), scaled (57)
Resampling: Cross-Validated (10 fold, repeated 5 times)
Summary of sample sizes: 129, 130, 131, 129, 129, 129, ...
Resampling results across tuning parameters:
ncomp RMSE Rsquared MAE
1 1.476990 0.4065863 1.159411
2 2.081159 0.4493633 1.277200
3 1.721405 0.4886891 1.153332
4 1.630823 0.5042695 1.147206
5 1.843340 0.4824557 1.229122
6 1.950280 0.4759969 1.267067
7 2.136332 0.4633424 1.323315
8 2.262960 0.4553036 1.363631
9 2.511104 0.4423085 1.438149
10 2.672648 0.4353155 1.490455
11 2.898909 0.4240499 1.571533
12 3.137282 0.4118001 1.657344
13 3.210455 0.4060440 1.684312
14 3.199930 0.3960440 1.681212
15 3.219084 0.3980042 1.689407
16 3.217109 0.3999200 1.691913
17 3.268484 0.3950983 1.705622
18 3.292657 0.3967421 1.699362
19 3.303488 0.4029003 1.685211
20 3.360074 0.4050533 1.695239
21 3.424481 0.4033245 1.717505
22 3.446761 0.4017849 1.729036
23 3.467865 0.3987998 1.738381
24 3.554200 0.3919026 1.763032
25 3.681167 0.3821253 1.805953
26 3.776590 0.3766263 1.841712
27 3.890946 0.3731292 1.877997
28 3.975647 0.3694684 1.908209
29 4.041655 0.3681282 1.934885
30 4.023776 0.3651582 1.932209
31 3.993455 0.3663747 1.921392
32 3.981448 0.3670410 1.916589
33 3.992171 0.3702859 1.922430
34 4.057324 0.3764081 1.942431
35 4.155892 0.3777213 1.965801
36 4.190437 0.3766064 1.977268
37 4.176134 0.3800273 1.972193
38 4.197296 0.3845478 1.975958
39 4.321553 0.3836802 2.008199
40 4.264780 0.3898474 1.991462
RMSE was used to select the optimal model using the smallest value.
The final value used for the model was ncomp = 1.
多元线性回归 (无调优参数)
#多元线性回归
set.seed(100)
lm.model <- train(x = train.Predictors, y = train.Yield,
method = "lm",
trControl = ctrl,
preProc = c("center", "scale"))
lm.model
testResults$lm<-predict(lm.model,test.Predictors)
> lm.model
Linear Regression
144 samples
57 predictor
Pre-processing: centered (57), scaled (57)
Resampling: Cross-Validated (10 fold, repeated 5 times)
Summary of sample sizes: 129, 130, 131, 129, 129, 129, ...
Resampling results:
RMSE Rsquared MAE
4.477628 0.3575784 2.099445
Tuning parameter 'intercept' was held constant at a value of TRUE
稳健回归(调优参数:是否有截距,目标函数)
#稳健回归
#如果预测变量数据框为奇异矩阵,则不能直接使用稳健回归
#可以用pca对预测变量进行预处理,然后再使用稳健回归 preProc = c("pca")
set.seed(100)
rlm.model <- train(x = train.Predictors, y = train.Yield,
method = "rlm",
trControl = ctrl,
preProc = c("center", "scale"))
rlm.model
testResults$rlm<-predict(rlm.model,test.Predictors)
最优参数:intercept = TRUE and psi = psi.huber
> rlm.model
Robust Linear Model
144 samples
57 predictor
Pre-processing: centered (57), scaled (57)
Resampling: Cross-Validated (10 fold, repeated 5 times)
Summary of sample sizes: 129, 130, 131, 129, 129, 129, ...
Resampling results across tuning parameters:
intercept psi RMSE Rsquared MAE
FALSE psi.huber 40.741181 0.3575856 40.416526
FALSE psi.hampel 40.741181 0.3575856 40.416526
FALSE psi.bisquare 40.738137 0.3576577 40.414645
TRUE psi.huber 4.892939 0.3354144 2.224460
TRUE psi.hampel 4.991625 0.3398510 2.256174
TRUE psi.bisquare 6.253662 0.2660194 2.706803
RMSE was used to select the optimal model using the smallest value.
The final values used for the model were intercept = TRUE and psi = psi.huber.
弹性网(调优参数:岭回归罚lambda,lasso罚fraction)
#enet弹性网
#弹性网模型同时具有岭回归罚参数和lasso 罚参数
#lambda为岭回归罚参数(当lambda为0时即为纯lasso模型)
#fraction为lasso罚参数,当lasso罚参数为0时即为纯岭回归模型
enetGrid <- expand.grid(lambda = seq(0, 1, length = 10),
fraction = seq(0, 1, length = 20))
set.seed(100)
enet.model<- train(x = train.Predictors, y = train.Yield,
method = "enet", #弹性网 elastic net
tuneGrid = enetGrid,
trControl = ctrl,
preProc = c("center", "scale"))
enet.model
testResults$enet<-predict(enet.model,test.Predictors)
最优参数 fraction = 0.3157895 and lambda = 0.1111111.
> enet.model
Elasticnet
144 samples
57 predictor
Pre-processing: centered (57), scaled (57)
Resampling: Cross-Validated (10 fold, repeated 5 times)
Summary of sample sizes: 129, 130, 131, 129, 129, 129, ...
Resampling results across tuning parameters:
lambda fraction RMSE Rsquared MAE
0.0000000 0.00000000 1.771408 NaN 1.4514918
0.0000000 0.05263158 1.137314 0.6195010 0.9080630
0.0000000 0.10526316 1.201264 0.5906999 0.9417703
0.0000000 0.15789474 1.977967 0.4609016 1.2368802
0.0000000 0.21052632 2.429146 0.4323302 1.3829538
0.0000000 0.26315789 2.647033 0.4223792 1.4581762
0.0000000 0.31578947 2.787488 0.4175743 1.5151082
0.0000000 0.36842105 2.883314 0.4103987 1.5506618
0.0000000 0.42105263 2.789688 0.4231831 1.5239585
0.0000000 0.47368421 2.751187 0.4208489 1.5169774
0.0000000 0.52631579 2.907013 0.4097454 1.5701907
0.0000000 0.57894737 3.001104 0.4113893 1.6000812
0.0000000 0.63157895 3.096753 0.4081374 1.6340639
0.0000000 0.68421053 3.265198 0.4022320 1.6831637
0.0000000 0.73684211 3.461160 0.3970032 1.7438772
0.0000000 0.78947368 3.640831 0.3881002 1.8073188
0.0000000 0.84210526 3.776349 0.3857004 1.8578005
0.0000000 0.89473684 3.970737 0.3790861 1.9290039
0.0000000 0.94736842 4.212998 0.3662999 2.0140739
0.0000000 1.00000000 4.477628 0.3575784 2.0994452
0.1111111 0.00000000 1.771408 NaN 1.4514918
0.1111111 0.05263158 1.569523 0.4811618 1.2722767
0.1111111 0.10526316 1.416838 0.5701030 1.1445218
0.1111111 0.15789474 1.289815 0.6004227 1.0468167
0.1111111 0.21052632 1.205665 0.6089252 0.9782767
0.1111111 0.26315789 1.159554 0.6118270 0.9356370
0.1111111 0.31578947 1.137137 0.6154347 0.9077345
0.1111111 0.36842105 1.159531 0.6029658 0.9222230
0.1111111 0.42105263 1.196985 0.5883347 0.9417218
0.1111111 0.47368421 1.246828 0.5809674 0.9624214
0.1111111 0.52631579 1.328806 0.5630178 0.9978619
0.1111111 0.57894737 1.432425 0.5434760 1.0433153
0.1111111 0.63157895 1.580462 0.5235473 1.1023351
0.1111111 0.68421053 1.732672 0.5105250 1.1554499
0.1111111 0.73684211 1.842935 0.5023709 1.1947970
0.1111111 0.78947368 1.900477 0.4959536 1.2189988
0.1111111 0.84210526 1.949361 0.4905344 1.2389754
0.1111111 0.89473684 1.996214 0.4856145 1.2571256
0.1111111 0.94736842 2.030594 0.4825339 1.2706286
0.1111111 1.00000000 2.061919 0.4802416 1.2819687
0.2222222 0.00000000 1.771408 NaN 1.4514918
0.2222222 0.05263158 1.582699 0.4739626 1.2842008
0.2222222 0.10526316 1.445378 0.5558647 1.1680481
0.2222222 0.15789474 1.319676 0.5962787 1.0705540
0.2222222 0.21052632 1.234360 0.6023935 1.0029017
0.2222222 0.26315789 1.185592 0.5985457 0.9643072
0.2222222 0.31578947 1.157142 0.6040956 0.9311194
0.2222222 0.36842105 1.147840 0.6053733 0.9183014
0.2222222 0.42105263 1.172490 0.5968102 0.9295206
0.2222222 0.47368421 1.216252 0.5819822 0.9529644
0.2222222 0.52631579 1.280461 0.5705332 0.9822868
0.2222222 0.57894737 1.335024 0.5588436 1.0078476
0.2222222 0.63157895 1.412176 0.5459754 1.0449413
0.2222222 0.68421053 1.510595 0.5332021 1.0871973
0.2222222 0.73684211 1.626394 0.5217768 1.1301981
0.2222222 0.78947368 1.742335 0.5124071 1.1723639
0.2222222 0.84210526 1.832459 0.5052867 1.2072798
0.2222222 0.89473684 1.876415 0.5004111 1.2264522
0.2222222 0.94736842 1.903996 0.4963218 1.2392167
0.2222222 1.00000000 1.929789 0.4926550 1.2503069
0.3333333 0.00000000 1.771408 NaN 1.4514918
0.3333333 0.05263158 1.586386 0.4708847 1.2875822
0.3333333 0.10526316 1.452081 0.5512157 1.1736360
0.3333333 0.15789474 1.328902 0.5913763 1.0781400
0.3333333 0.21052632 1.247915 0.5930494 1.0154428
0.3333333 0.26315789 1.198227 0.5903310 0.9756537
0.3333333 0.31578947 1.168400 0.5952703 0.9449571
0.3333333 0.36842105 1.156623 0.5992980 0.9261069
0.3333333 0.42105263 1.164238 0.6000730 0.9250761
0.3333333 0.47368421 1.213014 0.5856079 0.9505405
0.3333333 0.52631579 1.282325 0.5692635 0.9860519
0.3333333 0.57894737 1.327711 0.5607814 1.0108058
0.3333333 0.63157895 1.383353 0.5514807 1.0393276
0.3333333 0.68421053 1.458208 0.5428085 1.0742131
0.3333333 0.73684211 1.553698 0.5322740 1.1146072
0.3333333 0.78947368 1.660374 0.5212841 1.1565411
0.3333333 0.84210526 1.757325 0.5129770 1.1938817
0.3333333 0.89473684 1.835645 0.5071735 1.2235668
0.3333333 0.94736842 1.888216 0.5029577 1.2438691
0.3333333 1.00000000 1.913020 0.5002990 1.2544885
0.4444444 0.00000000 1.771408 NaN 1.4514918
0.4444444 0.05263158 1.587131 0.4689106 1.2882833
0.4444444 0.10526316 1.453055 0.5481805 1.1742613
0.4444444 0.15789474 1.332308 0.5855582 1.0814117
0.4444444 0.21052632 1.255529 0.5838330 1.0224983
0.4444444 0.26315789 1.203512 0.5852176 0.9797646
0.4444444 0.31578947 1.173833 0.5898226 0.9504253
0.4444444 0.36842105 1.165086 0.5937041 0.9336316
0.4444444 0.42105263 1.170915 0.5962111 0.9304951
0.4444444 0.47368421 1.208156 0.5887600 0.9506425
0.4444444 0.52631579 1.286901 0.5709330 0.9911093
0.4444444 0.57894737 1.337078 0.5617701 1.0221866
0.4444444 0.63157895 1.393842 0.5526263 1.0527241
0.4444444 0.68421053 1.455623 0.5470166 1.0846393
0.4444444 0.73684211 1.543548 0.5377431 1.1221049
0.4444444 0.78947368 1.637762 0.5284316 1.1614069
0.4444444 0.84210526 1.731552 0.5202815 1.1979213
0.4444444 0.89473684 1.813850 0.5136989 1.2286483
0.4444444 0.94736842 1.884535 0.5086537 1.2562324
0.4444444 1.00000000 1.943077 0.5045050 1.2793636
0.5555556 0.00000000 1.771408 NaN 1.4514918
0.5555556 0.05263158 1.586531 0.4672084 1.2879585
0.5555556 0.10526316 1.452053 0.5445135 1.1732624
0.5555556 0.15789474 1.333879 0.5782891 1.0834082
0.5555556 0.21052632 1.258416 0.5769443 1.0254727
0.5555556 0.26315789 1.205719 0.5811876 0.9806279
0.5555556 0.31578947 1.177785 0.5856236 0.9530035
0.5555556 0.36842105 1.172103 0.5895514 0.9388161
0.5555556 0.42105263 1.180745 0.5920771 0.9366191
0.5555556 0.47368421 1.213233 0.5891035 0.9550200
0.5555556 0.52631579 1.287258 0.5743233 0.9958625
0.5555556 0.57894737 1.354102 0.5628870 1.0350657
0.5555556 0.63157895 1.418334 0.5526315 1.0716292
0.5555556 0.68421053 1.479213 0.5473784 1.1046641
0.5555556 0.73684211 1.560439 0.5396893 1.1414543
0.5555556 0.78947368 1.650510 0.5314932 1.1791929
0.5555556 0.84210526 1.750111 0.5227778 1.2187775
0.5555556 0.89473684 1.847066 0.5149045 1.2559155
0.5555556 0.94736842 1.928713 0.5088652 1.2880741
0.5555556 1.00000000 1.999244 0.5040547 1.3159064
0.6666667 0.00000000 1.771408 NaN 1.4514918
0.6666667 0.05263158 1.584958 0.4664071 1.2868267
0.6666667 0.10526316 1.449876 0.5411684 1.1713837
0.6666667 0.15789474 1.333715 0.5716620 1.0839127
0.6666667 0.21052632 1.258738 0.5719035 1.0261469
0.6666667 0.26315789 1.206485 0.5778601 0.9801056
0.6666667 0.31578947 1.180528 0.5825003 0.9541922
0.6666667 0.36842105 1.178628 0.5862847 0.9436771
0.6666667 0.42105263 1.192490 0.5880606 0.9441044
0.6666667 0.47368421 1.224793 0.5874243 0.9625853
0.6666667 0.52631579 1.295060 0.5753557 1.0063528
0.6666667 0.57894737 1.370252 0.5637588 1.0488886
0.6666667 0.63157895 1.446135 0.5523346 1.0922606
0.6666667 0.68421053 1.513782 0.5456938 1.1302661
0.6666667 0.73684211 1.598973 0.5378909 1.1714708
0.6666667 0.78947368 1.698041 0.5297115 1.2135679
0.6666667 0.84210526 1.801671 0.5210069 1.2555559
0.6666667 0.89473684 1.905494 0.5126450 1.2957071
0.6666667 0.94736842 1.994592 0.5060740 1.3307047
0.6666667 1.00000000 2.069366 0.5011485 1.3602098
0.7777778 0.00000000 1.771408 NaN 1.4514918
0.7777778 0.05263158 1.582633 0.4667323 1.2850102
0.7777778 0.10526316 1.447461 0.5375241 1.1693484
0.7777778 0.15789474 1.332385 0.5659956 1.0835350
0.7777778 0.21052632 1.257670 0.5683465 1.0251626
0.7777778 0.26315789 1.206720 0.5749648 0.9792782
0.7777778 0.31578947 1.183032 0.5799951 0.9553337
0.7777778 0.36842105 1.186004 0.5833493 0.9480028
0.7777778 0.42105263 1.205879 0.5844341 0.9527527
0.7777778 0.47368421 1.241034 0.5844970 0.9737760
0.7777778 0.52631579 1.309416 0.5751552 1.0206340
0.7777778 0.57894737 1.387861 0.5644215 1.0672702
0.7777778 0.63157895 1.473547 0.5522845 1.1155053
0.7777778 0.68421053 1.561405 0.5423200 1.1641709
0.7777778 0.73684211 1.655748 0.5340078 1.2105331
0.7777778 0.78947368 1.757136 0.5262852 1.2541173
0.7777778 0.84210526 1.866682 0.5174869 1.2986273
0.7777778 0.89473684 1.972825 0.5094815 1.3401758
0.7777778 0.94736842 2.067288 0.5029539 1.3770679
0.7777778 1.00000000 2.145792 0.4980351 1.4079876
0.8888889 0.00000000 1.771408 NaN 1.4514918
0.8888889 0.05263158 1.579908 0.4676201 1.2827855
0.8888889 0.10526316 1.444725 0.5339888 1.1669361
0.8888889 0.15789474 1.330248 0.5611672 1.0825854
0.8888889 0.21052632 1.256084 0.5652489 1.0233329
0.8888889 0.26315789 1.206677 0.5724267 0.9783037
0.8888889 0.31578947 1.185758 0.5777748 0.9564875
0.8888889 0.36842105 1.193536 0.5808347 0.9516982
0.8888889 0.42105263 1.220419 0.5810332 0.9628614
0.8888889 0.47368421 1.260288 0.5810382 0.9883485
0.8888889 0.52631579 1.328078 0.5740733 1.0378387
0.8888889 0.57894737 1.408201 0.5644060 1.0893592
0.8888889 0.63157895 1.504790 0.5520081 1.1427235
0.8888889 0.68421053 1.611741 0.5393310 1.1992535
0.8888889 0.73684211 1.716812 0.5300661 1.2516697
0.8888889 0.78947368 1.822557 0.5225311 1.2977908
0.8888889 0.84210526 1.934361 0.5139993 1.3436852
0.8888889 0.89473684 2.043430 0.5064717 1.3865103
0.8888889 0.94736842 2.142513 0.5002002 1.4251330
0.8888889 1.00000000 2.225024 0.4954006 1.4580002
1.0000000 0.00000000 1.771408 NaN 1.4514918
1.0000000 0.05263158 1.577125 0.4682588 1.2804563
1.0000000 0.10526316 1.441717 0.5306579 1.1643440
1.0000000 0.15789474 1.327700 0.5569656 1.0811085
1.0000000 0.21052632 1.254219 0.5624930 1.0209944
1.0000000 0.26315789 1.206661 0.5701069 0.9775131
1.0000000 0.31578947 1.188900 0.5756304 0.9578117
1.0000000 0.36842105 1.201539 0.5785225 0.9554112
1.0000000 0.42105263 1.235590 0.5780937 0.9736209
1.0000000 0.47368421 1.281713 0.5775492 1.0056641
1.0000000 0.52631579 1.350101 0.5725023 1.0579131
1.0000000 0.57894737 1.432783 0.5638335 1.1140402
1.0000000 0.63157895 1.536828 0.5518440 1.1709872
1.0000000 0.68421053 1.658394 0.5371996 1.2332905
1.0000000 0.73684211 1.776760 0.5267168 1.2927635
1.0000000 0.78947368 1.888808 0.5190630 1.3426238
1.0000000 0.84210526 2.002729 0.5110275 1.3893781
1.0000000 0.89473684 2.115060 0.5039072 1.4332097
1.0000000 0.94736842 2.218818 0.4979082 1.4733968
1.0000000 1.00000000 2.305433 0.4932381 1.5080460
RMSE was used to select the optimal model using the smallest value.
The final values used for the model were fraction = 0.3157895 and lambda = 0.1111111.
模型比较
#模型比较
#resamples函数可以分析和可视化重抽样的结果
resamp <- resamples( list(lm=lm.model,rlm=rlm.model,pls=pls.model,enet=enet.model) )
summary(resamp)
dotplot( resamp, metric="RMSE" )
summary(diff(resamp))
> summary(resamp)
Call:
summary.resamples(object = resamp)
Models: lm, rlm, pls, enet
Number of resamples: 50
MAE
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
lm 0.6607011 1.0051993 1.3579719 2.0994452 2.0714864 11.214245 0
rlm 0.6847512 1.0559838 1.3709089 2.2244596 2.4281025 8.581744 0
pls 0.6938768 1.0405545 1.1263476 1.1594108 1.2788073 1.751240 0
enet 0.5054388 0.8031609 0.9130662 0.9077345 0.9978196 1.452202 0
RMSE
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
lm 0.8161917 1.3401882 1.623232 4.477628 3.810345 40.527543 0
rlm 0.8559126 1.3783897 1.762224 4.892939 4.049045 29.072243 0
pls 0.8480551 1.2313692 1.402683 1.476990 1.622498 2.567279 0
enet 0.6314113 0.9937715 1.148241 1.137137 1.261225 1.705158 0
Rsquared
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
lm 1.741799e-05 0.1117673 0.4098811 0.3575784 0.5367250 0.7717324 0
rlm 9.170329e-05 0.1208714 0.3832505 0.3354144 0.5393142 0.7368865 0
pls 1.146817e-01 0.2801540 0.4181062 0.4065863 0.5277755 0.7643387 0
enet 2.670573e-01 0.5087171 0.6200745 0.6154347 0.7196441 0.9092751 0
> dotplot( resamp, metric="RMSE" )
>
> summary(diff(resamp))
Call:
summary.diff.resamples(object = diff(resamp))
p-value adjustment: bonferroni
Upper diagonal: estimates of the difference
Lower diagonal: p-value for H0: difference = 0
MAE
lm rlm pls enet
lm -0.1250 0.9400 1.1917
rlm 1.0000000 1.0650 1.3167
pls 0.0205577 0.0014010 0.2517
enet 0.0027009 0.0001186 3.862e-12
RMSE
lm rlm pls enet
lm -0.4153 3.0006 3.3405
rlm 1.000000 3.4159 3.7558
pls 0.046525 0.004073 0.3399
enet 0.024330 0.001918 4.322e-08
Rsquared
lm rlm pls enet
lm 0.02216 -0.04901 -0.25786
rlm 0.60417 -0.07117 -0.28002
pls 0.46701 0.06187 -0.20885
enet 8.115e-10 9.614e-11 3.879e-12
根据RMSE,选择弹性网。
#训练集上的RMSE
summary(resamp$values$`enet~RMSE`)
#测试集上的RMSE
RMSE(testResults$enet,testResults$obs)
模型在测试集上的RMSE为1.309,比重抽样训练集RMSE的均值大15%.
> #训练集上的RMSE
> summary(resamp$values$`enet~RMSE`)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.6314 0.9938 1.1482 1.1371 1.2612 1.7052
> #测试集上的RMSE
> RMSE(testResults$enet,testResults$obs)
[1] 1.309685
#为了了解哪些预测变量用于建模,用enet函数预测模型,然后在predict 方法中指定type =” coefficients"
library(elasticnet)
lassoModel <- enet(x = as.matrix(train.Predictors), y = train.Yield,
lambda = 0.1111111,normalize = TRUE)
enetCoef <- predict(lassoModel, newx = as.matrix(solTestXtrans),
s= 0.3157895 , mode = "fraction",
type = "coefficients")
esort(enetCoef$coefficients[enetCoef$coefficients!=0],decreasing = TRUE)
可见,生物学指标2个,生产过程指标11个
> sort(enetCoef$coefficients[enetCoef$coefficients!=0],decreasing = TRUE)
ManufacturingProcess06 ManufacturingProcess32 BiologicalMaterial06 ManufacturingProcess11
2.307495e+05 3.664539e+03 3.270911e+00 2.609645e-01
ManufacturingProcess39 ManufacturingProcess42 ManufacturingProcess34 BiologicalMaterial03
4.030470e-02 2.183782e-02 1.920971e-02 1.393844e-02
ManufacturingProcess09 ManufacturingProcess37 ManufacturingProcess36 ManufacturingProcess17
4.606955e-03 -3.565983e-02 -5.896368e+00 -5.467550e+03
ManufacturingProcess13
-7.040046e+03
首先,预测变量系数的正负 表明生产率是与预测变量是正相关还是负相关。
然后,探究预测变量与生产率的关系。
以生产过程指标ManufacturingProcess32为例:
#作图,原始的预测变量与响应变量的函数关系图
#使用原始预测变量(只是填补了缺失值)
#某个预测变量的最大值与最小值
p_range <- range( Predictors.trans$ManufacturingProcess32 )
#创建该预测变量最大值与最小值范围内的一个向量,100维向量
variation <- seq( from=p_range[1], to=p_range[2], length.out=100 )
#计算原每个预测变量的均值 57维向量
mean_predictor_values <- apply( Predictors.trans, 2, mean )
#repmat函数所需要的包
install.packages('pracma')
library(pracma)
#repmat(a,m,n) creates a large matrix consisting of an m-by-n tiling of copies of a.
#复制矩阵 将57维向量(每个预测变量的均值)转变为100*57的矩阵
newdata <- repmat( as.double(mean_predictor_values), length(variation), 1 )
newdata <- data.frame( newdata )
colnames( newdata ) <- colnames( Predictors.trans )
#把原始预测变量数据框转变为这样的数据框:其他值都为其均值,只有进行分析的那个预测变量为变化值
newdata$ManufacturingProcess32 <- variation
#将新数据框进行box-cox变换(使用的lambda值为建模时使用的lambda值)
#将box-cox的结果赋值到一个列表中,列表中包含57个列表(57个向量的box-cox变换)
trans.boxcox<-list()
for (i in 1:length(Predictors.trans))
trans.boxcox[[i]]<- BoxCoxTrans(Predictors.trans[,i])
newdata.trans<-newdata
for (i in 1:length(Predictors.trans))
newdata.trans[,i]<-predict(trans.boxcox[[i]],newdata[,i])
#x轴为分析的预测变量变化值
xs <- variation
#Y轴为用新数据框进行预测的响应变量值
y_hat <- predict( enet.model, newdata=as.matrix(newdata.trans))
#作图
plot( xs, y_hat, xlab='variation in ManufacturingProcess32', ylab='predicted yield' )
> p_range
[1] 143 173
> variation
[1] 143.0000 143.3030 143.6061 143.9091 144.2121 144.5152 144.8182 145.1212 145.4242 145.7273
[11] 146.0303 146.3333 146.6364 146.9394 147.2424 147.5455 147.8485 148.1515 148.4545 148.7576
[21] 149.0606 149.3636 149.6667 149.9697 150.2727 150.5758 150.8788 151.1818 151.4848 151.7879
[31] 152.0909 152.3939 152.6970 153.0000 153.3030 153.6061 153.9091 154.2121 154.5152 154.8182
[41] 155.1212 155.4242 155.7273 156.0303 156.3333 156.6364 156.9394 157.2424 157.5455 157.8485
[51] 158.1515 158.4545 158.7576 159.0606 159.3636 159.6667 159.9697 160.2727 160.5758 160.8788
[61] 161.1818 161.4848 161.7879 162.0909 162.3939 162.6970 163.0000 163.3030 163.6061 163.9091
[71] 164.2121 164.5152 164.8182 165.1212 165.4242 165.7273 166.0303 166.3333 166.6364 166.9394
[81] 167.2424 167.5455 167.8485 168.1515 168.4545 168.7576 169.0606 169.3636 169.6667 169.9697
[91] 170.2727 170.5758 170.8788 171.1818 171.4848 171.7879 172.0909 172.3939 172.6970 173.0000
> head(newdata[1:5,1:5])
BiologicalMaterial01 BiologicalMaterial02 BiologicalMaterial03 BiologicalMaterial04
1 6.41142 55.68875 67.705 12.34926
2 6.41142 55.68875 67.705 12.34926
3 6.41142 55.68875 67.705 12.34926
4 6.41142 55.68875 67.705 12.34926
5 6.41142 55.68875 67.705 12.34926
BiologicalMaterial05
1 18.59864
2 18.59864
3 18.59864
4 18.59864
5 18.59864
> head(newdata[1:5,c('ManufacturingProcess32')])
[1] 143.0000 143.3030 143.6061 143.9091 144.2121
对于生产过程指标ManufacturingProcess32,提高5,约可将生产率提高一个百分点。
由于时间限制,其他预测变量不再赘述。