# R 语言决策树示例 (R decision tree example)
# 建模造树 (fit the tree model)
# Setup: load packages and split the kernlab spam data into train/test sets.
# NOTE(review): the original began with rm(list = ls()); clearing the
# workspace inside a script is an anti-pattern -- run in a fresh session instead.
library(kernlab)
library(tree)
library(rpart)
data(spam)
data <- spam
# Sample 3000 of the nrow(data) rows (4601 in this dataset) for training;
# the remaining rows form the test set.
# NOTE(review): no set.seed() before sample(), so this split is not
# reproducible -- add a seed if exact reproduction of the figures matters.
id <- sample(seq_len(nrow(data)), 3000)
train <- data[id, ]
test <- data[-id, ]
# Fit a classification tree predicting type (spam/nonspam) from all predictors.
# tree()'s `split` argument is a match.arg() choice: passing the full vector
# c("deviance", "gini") silently selects only the first entry, so name the
# intended criterion explicitly.
mdl <- tree(type ~ ., data = train, split = "deviance")

# Plot the tree with split labels at each node.
plot(mdl)
text(mdl, pretty = 0)

# Misclassification rate on the held-out test set.
pred <- predict(mdl, test, type = "class")
mean(pred != test$type)
# [1] 0.08869457

deviance(mdl) # training-set deviance
# [1] 1327.724
# 修剪 (prune)
# Prune the fitted tree down to 4 terminal nodes.
# `best` is the number of leaves (terminal nodes) to keep.
ptree <- prune.tree(mdl, best = 4)
plot(ptree)
text(ptree, pretty = 0)

# Depth of the pruned tree, derived from its frame's node numbers
# (height is 3 after pruning to 4 leaves).
nodes <- as.numeric(rownames(ptree$frame))
max(rpart:::tree.depth(nodes))

# Training-set deviance goes up after pruning (smaller tree, worse fit).
deviance(ptree)
# [1] 2142.959

# Push the test set through the pruned tree and measure its deviance.
ptree_test <- predict(ptree, newdata = test, type = "tree")
deviance(ptree_test)
# [1] 1185.11
# ROC 曲线 (ROC curves)
# ROC 曲线下方面积 (AUC) 越大,则模型越好。这里选用修剪叶数分别为 4 和 6 的两棵分类树作比较。
# Build ROC data: sweep the classification threshold pi over 0.05..0.95
# and record TPR/FPR for the trees pruned to 4 and 6 leaves.
thresholds <- seq(0.05, 0.95, 0.05)
dt5 <- as.data.frame(matrix(nrow = length(thresholds), ncol = 5))
dt5[, 1] <- thresholds

# Hoisted out of the loop: the pruned trees and their class-probability
# predictions are loop-invariant (the original recomputed them 19 times each).
p1 <- predict(prune.tree(mdl, best = 4), test, type = "vector")
p2 <- predict(prune.tree(mdl, best = 6), test, type = "vector")

for (i in seq_along(thresholds)) {
  # Classify as "spam" when P(spam) (column 2) exceeds the current threshold.
  pred1 <- factor(p1[, 2] > dt5[i, 1], labels = c("nonspam", "spam"),
                  levels = c("FALSE", "TRUE"))
  pred2 <- factor(p2[, 2] > dt5[i, 1], labels = c("nonspam", "spam"),
                  levels = c("FALSE", "TRUE"))
  mat_51 <- table(test$type, pred1, dnn = c("Labels", "prediction"))
  mat_52 <- table(test$type, pred2, dnn = c("Labels", "prediction"))
  # Row 2 = actual spam, column 2 = predicted spam, so:
  # TPR = spam flagged / all spam;  FPR = nonspam flagged / all nonspam.
  dt5[i, 2:5] <- c(mat_51[2, 2] / sum(mat_51[2, ]), mat_51[1, 2] / sum(mat_51[1, ]),
                   mat_52[2, 2] / sum(mat_52[2, ]), mat_52[1, 2] / sum(mat_52[1, ]))
}
names(dt5) <- c("pi","TPR", "FPR","TPR", "FPR")
dt55 <- rbind(dt5[,2:3],dt5[,4:5])
dt55$group <- c(rep("4 leaves",19),rep("6 leaves",19))
library(ggplot2)
ggplot(dt55, aes(x=FPR,y=TPR,color=group))+
geom_line()+
labs(title="ROC curves")
# 用交叉验证选择最佳叶子数 (choose the best number of leaves by cross-validation)
# Cross-validate the full tree to find the deviance-minimizing size.
set.seed(12345) # make the CV fold assignment reproducible
cv.res <- cv.tree(mdl)
# Plot CV deviance against tree size; `size` is the number of leaves.
plot(cv.res$size, cv.res$dev, type = "b", col = "red")