# 6.R语言 分类回归树--决策树、随机森林

R语言 专栏收录该内容
79 篇文章 6 订阅

## 关注CSDN博客：程志伟的博客

1.回归树

# Regression tree on the prostate data set: predict log PSA (lpsa).
data(prostate)
# Recode Gleason score as binary: score 6 -> 0, anything else -> 1.
prostate$gleason <- ifelse(prostate$gleason == 6, 0, 1)
# Split on the predefined `train` flag; keep only columns 1:9
# (the eight predictors plus the response lpsa).
pros.train <- subset(prostate, train == TRUE)[, 1:9]
pros.test <- subset(prostate, train == FALSE)[, 1:9]

set.seed(123)  # reproducible tree growing / cross-validation
tree.pros <- rpart(lpsa ~ ., data = pros.train)
# Inspect the complexity-parameter table (CP, nsplit, rel/x-val error).
tree.pros$cptable
CP nsplit rel error    xerror      xstd
1 0.35852251      0 1.0000000 1.0364016 0.1822698
2 0.12295687      1 0.6414775 0.8395071 0.1214181
3 0.11639953      2 0.5185206 0.7255295 0.1015424
4 0.05350873      3 0.4021211 0.7608289 0.1109777
5 0.01032838      4 0.3486124 0.6911426 0.1061507
6 0.01000000      5 0.3382840 0.7102030 0.1093327

CP表的第一列是成本复杂性参数（CP），第二列是树的分裂次数（nsplit），第三列是相对误差（rel error），第四列是交叉验证平均误差（xerror），第五列是交叉验证误差的标准差（xstd）。

plotcp(tree.pros)查看统计图，可以看出在第4次分裂时数据的误差是最小的。

# Prune at the CP of row 5 (4 splits), the row with the lowest
# cross-validated error (xerror) in the cptable above.
cp <- min(tree.pros$cptable[5, ])
prune.tree.pros <- prune(tree.pros, cp = cp)
# Visualize the pruned tree.
plot(as.party(prune.tree.pros))
# Predict on the held-out test set and compute the mean squared error.
party.pros.test <- predict(prune.tree.pros, newdata = pros.test)
rpart.resid <- party.pros.test - pros.test$lpsa  # calculate residuals
mean(rpart.resid^2)

[1] 0.5267748

2.分类树

> data(biopsy)
> biopsy <- biopsy[, -1]
> names(biopsy) <- c("thick", "u.size", "u.shape", "adhsn", "s.size", "nucl", "chrom", "n.nuc", "mit", "class")
> biopsy.v2 <- na.omit(biopsy)
> set.seed(123) #random number generator
> ind <- sample(2, nrow(biopsy.v2), replace = TRUE, prob = c(0.7, 0.3))
> biop.train <- biopsy.v2[ind == 1, ] #the training data set
> biop.test <- biopsy.v2[ind == 2, ] #the test data set
> str(biop.test)
'data.frame':    209 obs. of  10 variables:
 $ thick  : int  5 6 4 2 1 7 6 7 1 3 ...
 $ u.size : int  4 8 1 1 1 4 1 3 1 2 ...
 $ u.shape: int  4 8 1 2 1 6 1 2 1 1 ...
 $ adhsn  : int  5 1 3 1 1 4 1 10 1 1 ...
 $ s.size : int  7 3 2 2 1 6 2 5 2 1 ...
 $ nucl   : int  10 4 1 1 1 1 1 10 1 1 ...
 $ chrom  : int  3 3 3 3 3 4 3 5 3 2 ...
 $ n.nuc  : int  2 7 1 1 1 3 1 4 1 1 ...
 $ mit    : int  1 1 1 1 1 1 1 4 1 1 ...
 $ class  : Factor w/ 2 levels "benign","malignant": 1 1 1 1 1 2 1 2 1 1 ...
- attr(*, "na.action")= 'omit' Named int  24 41 140 146 159 165 236 250 276 293 ...
..- attr(*, "names")= chr  "24" "41" "140" "146" ...
> set.seed(123)
> tree.biop <- rpart(class ~ ., data = biop.train)
> tree.biop$cptable
          CP nsplit rel error    xerror       xstd
1 0.79651163      0 1.0000000 1.0000000 0.06086254
2 0.07558140      1 0.2034884 0.2674419 0.03746996
3 0.01162791      2 0.1279070 0.1453488 0.02829278
4 0.01000000      3 0.1162791 0.1744186 0.03082013
> cp <- min(tree.biop$cptable[3, ])
> prune.tree.biop = prune(tree.biop, cp = cp)
> # plot(as.party(tree.biop))
> plot(as.party(prune.tree.biop))
> rparty.test <- predict(prune.tree.biop, newdata = biop.test,
+                        type = "class")
> table(rparty.test, biop.test$class)

rparty.test benign malignant
  benign       136         3
  malignant      6        64
> (136 + 64) / 209
[1] 0.9569378

3.随机森林回归

> set.seed(123)
> rf.pros <- randomForest(lpsa ~ ., data = pros.train)
> rf.pros

Call:
 randomForest(formula = lpsa ~ ., data = pros.train)
               Type of random forest: regression
                     Number of trees: 500
No. of variables tried at each split: 2

          Mean of squared residuals: 0.6936697
                    % Var explained: 51.73

随机森林生成了500个树，每次分裂时随机抽取两个变量。MSE为0.69，差不多52%的方差得到解释。

> plot(rf.pros)
> which.min(rf.pros$mse)
[1] 80
> set.seed(123)
> rf.pros.2 <- randomForest(lpsa ~ ., data = pros.train, ntree = 80)
> rf.pros.2

Call:
randomForest(formula = lpsa ~ ., data = pros.train, ntree = 80)
Type of random forest: regression
Number of trees: 80
No. of variables tried at each split: 2

Mean of squared residuals: 0.6566502
% Var explained: 54.31
> varImpPlot(rf.pros.2, scale = TRUE,
+            main = "Variable Importance Plot - PSA Score")

> importance(rf.pros.2)
IncNodePurity
lcavol      25.011557
lweight     15.822110
age          7.167320
lbph         5.471032
svi          8.497838
lcp          8.113947
gleason      4.990213
pgg45        6.663911
> rf.pros.test <- predict(rf.pros.2, newdata = pros.test)
> # plot(rf.pros.test, pros.test$lpsa)
> rf.resid <- rf.pros.test - pros.test$lpsa # calculate residual
> mean(rf.resid^2)
[1] 0.5512549

4.随机森林分类

> set.seed(123)
> rf.biop <- randomForest(class ~ ., data = biop.train)
> rf.biop

Call:
randomForest(formula = class ~ ., data = biop.train)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 3

OOB estimate of  error rate: 3.38%
Confusion matrix:
benign malignant class.error
benign       294         8  0.02649007
malignant      8       164  0.04651163
> plot(rf.biop)

> which.min(rf.biop$err.rate[, 1])
[1] 125
> set.seed(123)
> rf.biop.2 <- randomForest(class ~ ., data = biop.train, ntree = 125)
> # getTree(rf.biop, 1)
> rf.biop.2

Call:
 randomForest(formula = class ~ ., data = biop.train, ntree = 125)
               Type of random forest: classification
                     Number of trees: 125
No. of variables tried at each split: 3

        OOB estimate of  error rate: 2.95%
Confusion matrix:
          benign malignant class.error
benign       294         8  0.02649007
malignant      6       166  0.03488372
> rf.biop.test <- predict(rf.biop.2,
+                         newdata = biop.test,
+                         type = "response")
> table(rf.biop.test, biop.test$class)

rf.biop.test benign malignant
benign       138         0
malignant      4        67
> (138 + 67) / 209
[1] 0.9808612
> varImpPlot(rf.biop.2)

# Load the Pima Indians diabetes data (pre-split training and test
# halves from MASS) and combine them into one data frame, so we can
# make our own random train/test split below.
data(Pima.tr)
data(Pima.te)
pima <- rbind(Pima.tr, Pima.te)

> set.seed(123)
> ind <- sample(2, nrow(pima), replace = TRUE, prob = c(0.7, 0.3))
> pima.train <- pima[ind == 1, ]
> pima.test <- pima[ind == 2, ]
> set.seed(123)
> rf.pima <- randomForest(type ~ ., data = pima.train)
> rf.pima

Call:
randomForest(formula = type ~ ., data = pima.train)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 2

OOB estimate of  error rate: 22.57%
Confusion matrix:
No Yes class.error
No  226  30   0.1171875
Yes  56  69   0.4480000
> # plot(rf.pima)
> which.min(rf.pima$err.rate[, 1])
[1] 244
> rf.pima.2 <- randomForest(type ~ ., data = pima.train, ntree = 244)
> rf.pima.2

Call:
 randomForest(formula = type ~ ., data = pima.train, ntree = 244)
               Type of random forest: classification
                     Number of trees: 244
No. of variables tried at each split: 2

        OOB estimate of  error rate: 23.62%
Confusion matrix:
     No Yes class.error
No  223  33   0.1289062
Yes  57  68   0.4560000
> rf.pima.test <- predict(rf.pima.2,
+                         newdata = pima.test,
+                         type = "response")
> table(rf.pima.test, pima.test$type)

rf.pima.test No Yes
No  85  16
Yes 14  36
> (85+36)/(85+16+14+36)
[1] 0.8013245

• 0
点赞
• 0
评论
• 25
收藏
• 一键三连
• 扫一扫，分享海报

02-17
04-07 7959

02-19 346
03-03 3000
12-27 3150
12-26 1007
11-04 1万+
07-26 4148
02-27 2413
06-04 7907
05-20 5572
03-02
06-10 4324
01-26 352
11-29 3110

1.余额是钱包充值的虚拟货币，按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载，可以购买VIP、C币套餐、付费专栏及课程。