决策树模型的构建包括特征属性选择与剪枝处理。基于 CART 算法的优点并结合本研究的数据情况,特征选择以基尼指数作为划分标准,以 26 个自变量作为解释变量、是否并发间质性肺疾病作为结局变量,利用 rpart 包初步构建模型。
# Exploratory overview of the input table `wdbc`
# (`wdbc` is the table name; substitute your own data frame).
library(skimr)
skim(wdbc) # per-column summary of the data
library(DataExplorer)
plot_missing(wdbc) # inspect how much is missing in each column
library(ggplot2)
library(lattice)
library(caret)
table(wdbc$label) # class counts; `label` is the grouping (outcome) variable
# Split `wdbc` into a training set (p = 0.75 of the rows, drawn by caret's
# createDataPartition on `label`) and a test set (the remaining rows).
set.seed(42) # fixed seed so the same rows are drawn on every run
library(ggplot2)
library(lattice)
library(caret)
trains <- createDataPartition(
  y = wdbc$label,
  p = 0.75,
  list = FALSE # return row indices as a matrix rather than a list
)
traindata <- wdbc[trains, ]
testdata <- wdbc[-trains, ]
# Class counts in each partition, to check that the outcome is balanced
# similarly in both.
table(traindata$label)
table(testdata$label)
# Build the classification formula `label ~ x1 + x2 + ...`.
colnames(wdbc) # print the column names for reference
# Columns 2 to 27 of the training data are used as the explanatory variables
# (26 columns in total).
# NOTE(review): this relies on the column order of `wdbc` -- confirm that
# `label` is not among columns 2:27.
form_cls <- as.formula(
  paste0("label~", paste(colnames(traindata)[2:27], collapse = "+"))
)
form_cls # print the resulting formula
# Fit the initial (unpruned) classification tree with rpart.
library(rpart)
library(rpart.plot)
set.seed(42) # rpart's internal cross-validation is random; fix it
fit_dt_cls <- rpart(
  formula = form_cls,
  data = traindata,
  method = "class",             # classification tree
  parms = list(split = "gini"), # split on the Gini index
  # cp (complexity parameter) controls how complex the tree may grow; an
  # overly complex tree overfits. The rpart default is 0.01, so 0.001 allows
  # a larger initial tree.
  control = rpart.control(cp = 0.001)
)
fit_dt_cls
# The cp values of the initial model are listed by printcp() ...
printcp(fit_dt_cls)
# ... and visualised by plotcp(), with the number of splits on the upper axis.
plotcp(fit_dt_cls, upper = "splits")
初始模型CP值情况如下图
复杂度参数及预测误差图如下图
# Bar chart of the variable importance of the pruned tree.
library(haven) # supplies as_factor(), used below for the x-axis ordering
library(rpart)
library(ggplot2)
# The script uses `fit_dt_cls_pruned` and `varimpdata` without defining them
# anywhere in the visible code. The fallbacks below only run when the objects
# do not exist yet, so a definition made elsewhere is never overridden.
# NOTE(review): the cp value originally used for pruning is not shown; the
# fallback prunes at the cp with the smallest cross-validated error in the
# cp table -- confirm this matches the intended pruning rule.
if (!exists("fit_dt_cls_pruned")) {
  cp_table <- fit_dt_cls$cptable
  fit_dt_cls_pruned <- prune(
    fit_dt_cls,
    cp = cp_table[which.min(cp_table[, "xerror"]), "CP"]
  )
}
if (!exists("varimpdata")) {
  # One row per variable (row names = variable names), one `importance` column.
  varimpdata <- data.frame(importance = fit_dt_cls_pruned$variable.importance)
}
ggplot(
  varimpdata,
  aes(x = as_factor(rownames(varimpdata)), y = importance)
) +
  geom_col() +
  labs(x = "variables") +
  theme_classic() +
  # Tilt the variable names so that they do not overlap.
  theme(axis.text.x = element_text(angle = 15, hjust = 1))
# Draw the pruned decision tree.
library(rpart)
library(rpart.plot)
prp(
  fit_dt_cls_pruned,
  type = 5,             # node/split labelling style (see rpart.plot docs)
  extra = 104,          # extra per-node information shown for class models
  tweak = 1,            # text size multiplier (1 = unchanged)
  fallen.leaves = TRUE, # line the leaf nodes up at the bottom of the plot
  main = "Decision Tree"
)
生成决策树见下图
# ROC curve of the pruned tree on the training data.
library(robustbase)
library(rrcov)
library(pROC)
# Predicted class probabilities: one column per class.
trainpredprob <- predict(
  fit_dt_cls_pruned,
  newdata = traindata,
  type = "prob"
)
# The second probability column is the score handed to roc().
# NOTE(review): presumably this is the probability of label "1" -- confirm
# against the column names of `trainpredprob`.
trainroc <- roc(
  response = traindata$label,
  predictor = trainpredprob[, 2]
)
plot(
  trainroc,
  print.auc = TRUE,
  auc.polygon = TRUE,
  grid = TRUE,
  max.auc.polygon = TRUE,
  auc.polygon.col = "skyblue",
  print.thres = TRUE,
  legacy.axes = TRUE,
  bty = "l"
)
生成ROC曲线,见下图:
# Pick the classification cut-off that maximises Youden's J
# (sensitivity + specificity - 1) on the training ROC curve.
bestp <- trainroc$thresholds[
  which.max(trainroc$sensitivities + trainroc$specificities - 1)
]
bestp
# `bestp` is a cut-off on trainpredprob[, 2], the score that was given to
# roc(). It must therefore be compared against that same column: comparing it
# against column 1 (as before) is only equivalent when bestp is exactly 0.5.
# Levels are fixed explicitly so that the factor keeps both classes even when
# every row is predicted as the same class.
# NOTE(review): assumes column 2 is the probability of label "1" and that the
# ROC direction is "<" (higher score = case) -- confirm.
trainpredlab <- factor(
  ifelse(trainpredprob[, 2] > bestp, "1", "0"),
  levels = c("0", "1")
)
library(caret)
# NOTE(review): positive = "0" is kept from the original call; confirm that
# "0" is really the class the reported recall/precision should refer to.
confusionMatrix(
  trainpredlab,
  factor(traindata$label),
  positive = "0",
  mode = "everything"
)
# Evaluate the pruned tree on the held-out test data.
testpredprob <- predict(fit_dt_cls_pruned, newdata = testdata, type = "prob")
# `bestp` was derived on the training ROC from probability column 2, so the
# test predictions must be thresholded on column 2 as well (column 1 was used
# before, which is only equivalent when bestp is exactly 0.5).
# Levels are fixed so both classes are present even if only one is predicted.
# NOTE(review): assumes column 2 is the probability of label "1" -- confirm.
testpredlab <- factor(
  ifelse(testpredprob[, 2] > bestp, "1", "0"),
  levels = c("0", "1")
)
# NOTE(review): positive = "0" is kept from the original call -- confirm.
confusionMatrix(
  testpredlab,
  factor(testdata$label),
  positive = "0",
  mode = "everything"
)
# Reuse the direction chosen on the training data instead of letting roc()
# re-detect it on the test set, which would bias the test AUC upwards.
testroc <- roc(
  response = testdata$label,
  predictor = testpredprob[, 2],
  direction = trainroc$direction
)
# Overlay the training and test ROC curves in one figure.
plot(
  trainroc,
  print.auc = TRUE,
  grid = c(0.1, 0.2),
  auc.polygon = FALSE,
  max.auc.polygon = TRUE,
  main = "ROC",
  grid.col = c("green", "red")
)
# Add the test curve to the existing plot; its AUC label is printed at
# y = 0.4 so that it does not sit on top of the training AUC label.
plot(
  testroc,
  print.auc = TRUE,
  print.auc.y = 0.4,
  add = TRUE,
  col = "red"
)
# The training curve was drawn in the default foreground colour.
legend(
  "bottomright",
  legend = c("traindata", "testdata"),
  col = c(par("fg"), "red"),
  lwd = 2,
  cex = 0.9
)
生成预测的ROC曲线,见下图:
# NOTE(review): this repeats the training-set evaluation performed earlier in
# the script (same threshold, same data) -- confirm whether the test set was
# meant here.
# Cut-off maximising Youden's J (sensitivity + specificity - 1) on the
# training ROC curve.
bestp <- trainroc$thresholds[
  which.max(trainroc$sensitivities + trainroc$specificities - 1)
]
bestp
# `bestp` is a cut-off on trainpredprob[, 2] (the score given to roc()), so it
# is compared against that column; using column 1 is only equivalent when
# bestp is exactly 0.5. Levels are fixed so both classes are always present.
# NOTE(review): assumes column 2 is the probability of label "1" -- confirm.
trainpredlab <- factor(
  ifelse(trainpredprob[, 2] > bestp, "1", "0"),
  levels = c("0", "1")
)
library(caret)
# NOTE(review): positive = "0" is kept from the original call -- confirm.
confusionMatrix(
  trainpredlab,
  factor(traindata$label),
  positive = "0",
  mode = "everything"
)
生成的模型评价指标如下:
准确度 64.49%,召回率 65.96%,精确率 57.14%,F1 值为 59.50%,AUC 值为 64.30%。