《应用预测建模》Applied Predictive Modeling (2013) by Max Kuhn and Kjell Johnson,林荟等译
第三章 数据预处理
3.3 本书第5章会介绍定量构效关系(QSAR)模型。该模型用化合物的特征去预测其他的化学性质。caret包中有一个来自Mente和Lombardo(2005)的数据集。数据包含对208种化合物渗透血脑屏障能力的实验检测。每种化合物对应134个描述变量。
(a)打开R,用如下命令载入数据:
library(caret)
data(BloodBrain)
数值型结果变量包含在向量logBBB中,预测变量包含在数据框bbbDescr中。
( b)是否有某些预测变量服从退化的分布?
( c)总体来看,预测变量间是否有强相关性?如果有,如何降低这种相关性?这对用于建模的预测变量数目是否会有很大影响?
( a )打开R 用如下命令载入数据:
数值型结果变量包含在向量logBBB 中,预测变量包含在数据框bbbDescr 中。
# Load caret, the package that ships the BloodBrain data set.
library(caret)
# Loading BloodBrain provides bbbDescr (predictor data frame) and
# logBBB (numeric outcome vector).
data(BloodBrain)
# Structure and first rows of the predictors.
str(bbbDescr)
head(bbbDescr)
# Structure and first values of the outcome.
str(logBBB)
head(logBBB)
> str(bbbDescr)
'data.frame': 208 obs. of 134 variables:
$ tpsa : num 12 49.3 50.5 37.4 37.4 ...
$ nbasic : int 1 0 1 0 1 1 1 1 1 1 ...
$ negative : int 0 0 0 0 0 0 0 0 0 0 ...
$ vsa_hyd : num 167.1 92.6 295.2 319.1 299.7 ...
$ a_aro : int 0 6 15 15 12 11 6 12 12 6 ...
$ weight : num 156 151 366 383 326 ...
$ peoe_vsa.0 : num 76.9 38.2 58.1 62.2 74.8 ...
$ peoe_vsa.1 : num 43.4 25.5 124.7 124.7 118 ...
$ peoe_vsa.2 : num 0 0 21.7 13.2 33 ...
$ peoe_vsa.3 : num 0 8.62 8.62 21.79 0 ...
$ peoe_vsa.4 : num 0 23.3 17.4 0 0 ...
$ peoe_vsa.5 : num 0 0 0 0 0 0 0 0 0 0 ...
$ peoe_vsa.6 : num 17.24 0 8.62 8.62 8.62 ...
$ peoe_vsa.0.1 : num 18.7 49 83.8 83.8 83.8 ...
[list output truncated]
> head(bbbDescr)
tpsa nbasic negative vsa_hyd a_aro weight peoe_vsa.0 peoe_vsa.1 peoe_vsa.2
1 12.03 1 0 167.06700 0 156.293 76.94749 43.44619 0.00000
2 49.33 0 0 92.64243 6 151.165 38.24339 25.52006 0.00000
3 50.53 1 0 295.16700 15 366.485 58.05473 124.74020 21.65084
4 37.39 0 0 319.11220 15 382.552 62.23933 124.74020 13.19232
5 37.39 1 0 299.65800 12 326.464 74.80064 118.04060 33.00190
6 37.39 1 0 289.77770 11 332.492 74.80064 109.50990 13.19232
peoe_vsa.3 peoe_vsa.4 peoe_vsa.5 peoe_vsa.6 peoe_vsa.0.1 peoe_vsa.1.1 peoe_vsa.2.1
1 0.000000 0.00000 0 17.238030 18.74768 43.50657 0
2 8.619013 23.27370 0 0.000000 49.01962 0.00000 0
3 8.619013 17.44054 0 8.619013 83.82487 49.01962 0
4 21.785640 0.00000 0 8.619013 83.82487 68.78024 0
5 0.000000 0.00000 0 8.619013 83.82487 36.76471 0
6 21.785640 0.00000 0 8.619013 73.54603 44.27042 0
peoe_vsa.3.1 peoe_vsa.4.1 peoe_vsa.5.1 peoe_vsa.6.1 a_acc a_acid a_base vsa_acc
1 0 0.000000 0.000000 0.000000 0 0 1 0.000000
2 0 0.000000 13.566920 7.904431 2 0 0 13.566920
3 0 5.682576 2.503756 2.640647 2 0 1 8.186332
4 0 5.682576 0.000000 2.640647 2 0 1 8.186332
5 0 5.682576 0.136891 2.503756 2 0 1 8.186332
6 0 5.682576 0.000000 2.640647 2 0 1 8.186332
> str(logBBB)
num [1:208] 1.08 -0.4 0.22 0.14 0.69 0.44 -0.43 1.38 0.75 0.88 ...
> head(logBBB)
[1] 1.08 -0.40 0.22 0.14 0.69 0.44
( b)是否有某些预测变量服从退化的分布?
library(caret)
# nearZeroVar flags predictors that have a single unique value (zero-variance
# predictors), or predictors that have both of the following characteristics:
# they have very few unique values relative to the number of samples, and the
# ratio of the frequency of the most common value to the frequency of the
# second most common value is large.
# Signature: nearZeroVar(x,freqCut = 95/5,uniqueCut = 10,saveMetrics = FALSE,names = FALSE,foreach = FALSE,allowParallel = TRUE)
# freqCut: cutoff for the ratio of the most common value to the second most common value; default 95/5
# uniqueCut: cutoff for the percentage of distinct values out of the total number of samples; default 10
# Column indices of the predictors flagged as near-zero-variance.
near.zero.ind<-nearZeroVar(bbbDescr)
# Names of the flagged predictors.
names(bbbDescr[near.zero.ind])
可见,有7个变量服从退化的分布。建议从数据集中移除这些变量:它们会削弱一些模型,删除后能显著提高模型的表现与稳定性(p31)。
> names(bbbDescr[near.zero.ind])
[1] "negative" "peoe_vsa.2.1" "peoe_vsa.3.1" "a_acid" "vsa_acid"
[6] "frac.anion7." "alert"
( c)总体来看,预测变量间是否有强相关性?如果有,如何降低这种相关性?这对用于建模的预测变量数目是否会有很大影响?
分析数据,发现所有预测变量均为数值型,因此可以直接计算皮尔森相关系数。
# Pairwise correlation matrix of all predictors.
correlations<-cor(bbbDescr)# correlation coefficients; cor() defaults to Pearson
# List every pair of variables whose absolute correlation exceeds a cutoff.
#
# Arguments:
#   x       Correlation matrix (or data frame) with one row and one column
#           per variable, e.g. the result of cor().
#   cutoff  Threshold applied to the absolute correlation; defaults to 0.75.
#
# Returns a data frame with columns col1, name1, col2, name2 and cor: the
# index and name of each variable in the pair, plus their correlation.
# Pairs are ordered by col1, then col2, and each unordered pair appears once.
# The first row is a placeholder (1, 2, 3, 4, 5) kept for backward
# compatibility, so callers drop it with [-1, ]. When at least one pair is
# found every column is character, as in the historical output; with no pair
# only the numeric placeholder row is returned.
CalStrongCor <- function(x, cutoff = 0.75) {
  cor_df <- as.data.frame(x)
  var_names <- names(cor_df)
  cor_mat <- as.matrix(cor_df)
  placeholder <- data.frame(col1 = 1, name1 = 2, col2 = 3, name2 = 4, cor = 5)

  # Only the strict upper triangle is scanned so each pair is reported once.
  # abs() is applied to the coefficient itself so that strong negative
  # correlations are reported too; NA coefficients are skipped.
  strong <- upper.tri(cor_mat) & !is.na(cor_mat) & abs(cor_mat) > cutoff
  hits <- which(strong, arr.ind = TRUE)
  if (nrow(hits) == 0) {
    return(placeholder)
  }

  # which() walks the matrix column by column; reorder by row, then column.
  hits <- hits[order(hits[, 1], hits[, 2]), , drop = FALSE]
  row_idx <- unname(hits[, 1])
  col_idx <- unname(hits[, 2])

  data.frame(
    col1 = c("1", as.character(row_idx)),
    name1 = c("2", var_names[row_idx]),
    col2 = c("3", as.character(col_idx)),
    name2 = c("4", var_names[col_idx]),
    cor = c("5", as.character(cor_mat[cbind(row_idx, col_idx)])),
    stringsAsFactors = FALSE
  )
}
# Collect the strongly correlated pairs. The first row of the result is a
# placeholder, which is why it is dropped with [-1,] below.
# NOTE(review): this variable reuses the name of the cor() function called
# earlier in this file.
cor<-CalStrongCor(correlations)
# Preview the first pairs.
head(cor[-1,])
# Number of pairs found.
nrow(cor[-1,])
计算发现,有254对变量的相关系数大于0.75
> head(cor[-1,])
col1 name1 col2 name2 cor
2 1 tpsa 21 a_acc 0.752469496802017
3 1 tpsa 48 tpsa.1 0.985797985082117
4 1 tpsa 57 nocount 0.92194304649877
5 1 tpsa 65 adistm 0.774514701022847
6 1 tpsa 67 polar_area 0.897182380302012
7 1 tpsa 103 dpsa3 0.860838530974004
> nrow(cor[-1,])
[1] 254
基于相关系数过滤变量
# Filter predictors by pairwise correlation (p33).
# findCorrelation() returns the column indices it suggests removing.
highCorr <- findCorrelation(correlations, cutoff = 0.75)
highCorr
# Guard the empty case: bbbDescr[, -integer(0)] would select zero columns
# instead of keeping all of them. drop = FALSE keeps a data frame even when
# only one predictor remains.
filteredBbbDescr <- if (length(highCorr) > 0) {
  bbbDescr[, -highCorr, drop = FALSE]
} else {
  bbbDescr
}
用该方法删除了66个变量
# Number of predictors flagged for removal.
length(highCorr)
# Names of the flagged predictors.
names(bbbDescr[highCorr])
> names(bbbDescr[highCorr])
[1] "vsa_don" "slogp_vsa2" "slogp_vsa7"
[4] "smr_vsa0" "smr_vsa5" "mw"
[7] "nocount" "hbdnr" "ub"
[10] "nonpolar_area" "tcnp" "ovality"
[13] "surface_area" "volume" "ppsa1"
[16] "ppsa2" "ppsa3" "pnsa2"
[19] "pnsa3" "fpsa2" "fnsa2"
[22] "fnsa3" "wpsa1" "wpsa2"
[25] "wpsa3" "wnsa1" "wnsa2"
[28] "wnsa3" "dpsa1" "dpsa2"
[31] "dpsa3" "sadh1" "sadh3"
[34] "chdh1" "chdh3" "scdh1"
[37] "scdh2" "scdh3" "saaa1"
[40] "saaa3" "scaa1" "scaa2"
[43] "scaa3" "ctdh" "ctaa"
[46] "mchg" "vsa_hyd" "tpsa"
[49] "a_acid" "a_base" "vsa_acc"
[52] "slogp_vsa3" "weight" "logp.o.w."
[55] "tpsa.1" "a_acc" "adistm"
[58] "polar_area" "psa_npsa" "homo"
[61] "sum_absolute_charge" "fpsa1" "fpsa3"
[64] "sadh2" "chaa1" "chdh2"