6.R语言 分类回归树--决策树、随机森林

R语言 专栏收录该内容
79 篇文章 6 订阅

关注微信公共号:小程在线

关注CSDN博客:程志伟的博客

1.回归树

data(prostate)
prostate$gleason <- ifelse(prostate$gleason == 6, 0, 1)
pros.train <- subset(prostate, train == TRUE)[, 1:9]
pros.test = subset(prostate, train == FALSE)[, 1:9]

set.seed(123)
tree.pros <- rpart(lpsa ~ ., data = pros.train)
tree.pros$cptable
> tree.pros$cptable
          CP nsplit rel error    xerror      xstd
1 0.35852251      0 1.0000000 1.0364016 0.1822698
2 0.12295687      1 0.6414775 0.8395071 0.1214181
3 0.11639953      2 0.5185206 0.7255295 0.1015424
4 0.05350873      3 0.4021211 0.7608289 0.1109777
5 0.01032838      4 0.3486124 0.6911426 0.1061507
6 0.01000000      5 0.3382840 0.7102030 0.1093327

CP的第一列是成本复杂性参数,第二列是树的分裂次数,第三列是相对误差,第四列是平均误差,第五列是交叉验证的标准差。

plotcp(tree.pros)查看统计图,可以看出在第4次分裂时数据的误差是最小的。

cp <- min(tree.pros$cptable[5, ])
prune.tree.pros <- prune(tree.pros, cp = cp)
plot(as.party(prune.tree.pros))

party.pros.test <- predict(prune.tree.pros, 
                           newdata = pros.test)
rpart.resid <- party.pros.test - pros.test$lpsa #calculate residual
mean(rpart.resid^2)

[1] 0.5267748

2.分类树

> data(biopsy)
> biopsy <- biopsy[, -1]
> names(biopsy) <- c("thick", "u.size", "u.shape", "adhsn", "s.size", "nucl", "chrom", "n.nuc", "mit", "class")
> biopsy.v2 <- na.omit(biopsy)
> set.seed(123) #random number generator
> ind <- sample(2, nrow(biopsy.v2), replace = TRUE, prob = c(0.7, 0.3))
> biop.train <- biopsy.v2[ind == 1, ] #the training data set
> biop.test <- biopsy.v2[ind == 2, ] #the test data set
> str(biop.test)
'data.frame':    209 obs. of  10 variables:
 $ thick  : int  5 6 4 2 1 7 6 7 1 3 ...
 $ u.size : int  4 8 1 1 1 4 1 3 1 2 ...
 $ u.shape: int  4 8 1 2 1 6 1 2 1 1 ...
 $ adhsn  : int  5 1 3 1 1 4 1 10 1 1 ...
 $ s.size : int  7 3 2 2 1 6 2 5 2 1 ...
 $ nucl   : int  10 4 1 1 1 1 1 10 1 1 ...
 $ chrom  : int  3 3 3 3 3 4 3 5 3 2 ...
 $ n.nuc  : int  2 7 1 1 1 3 1 4 1 1 ...
 $ mit    : int  1 1 1 1 1 1 1 4 1 1 ...
 $ class  : Factor w/ 2 levels "benign","malignant": 1 1 1 1 1 2 1 2 1 1 ...
 - attr(*, "na.action")= 'omit' Named int  24 41 140 146 159 165 236 250 276 293 ...
  ..- attr(*, "names")= chr  "24" "41" "140" "146" ...
> set.seed(123)
> tree.biop <- rpart(class ~ ., data = biop.train)
> tree.biop$cptable
          CP nsplit rel error    xerror       xstd
1 0.79651163      0 1.0000000 1.0000000 0.06086254
2 0.07558140      1 0.2034884 0.2674419 0.03746996
3 0.01162791      2 0.1279070 0.1453488 0.02829278
4 0.01000000      3 0.1162791 0.1744186 0.03082013
> cp <- min(tree.biop$cptable[3, ])
> prune.tree.biop = prune(tree.biop, cp = cp)
> # plot(as.party(tree.biop))
> plot(as.party(prune.tree.biop))
> rparty.test <- predict(prune.tree.biop, newdata = biop.test,
+                        type = "class")
> table(rparty.test, biop.test$class)
           
rparty.test benign malignant
  benign       136         3
  malignant      6        64
> (136+64)/209
[1] 0.9569378

3.随机森林回归

> set.seed(123)
> rf.pros <- randomForest(lpsa ~ ., data = pros.train)
> rf.pros

Call:
 randomForest(formula = lpsa ~ ., data = pros.train) 
               Type of random forest: regression
                     Number of trees: 500
No. of variables tried at each split: 2

          Mean of squared residuals: 0.6936697
                    % Var explained: 51.73

随机森林生成了500个树,每次分裂时随机抽取两个变量。ESM为0.69,差不多52%的方差得到解释。
> plot(rf.pros)


> which.min(rf.pros$mse)
[1] 80
> set.seed(123)
> rf.pros.2 <- randomForest(lpsa ~ ., data = pros.train, ntree = 80)
> rf.pros.2

Call:
 randomForest(formula = lpsa ~ ., data = pros.train, ntree = 80) 
               Type of random forest: regression
                     Number of trees: 80
No. of variables tried at each split: 2

          Mean of squared residuals: 0.6566502
                    % Var explained: 54.31
> varImpPlot(rf.pros.2, scale = TRUE,
+            main = "Variable Importance Plot - PSA Score")


> importance(rf.pros.2)
        IncNodePurity
lcavol      25.011557
lweight     15.822110
age          7.167320
lbph         5.471032
svi          8.497838
lcp          8.113947
gleason      4.990213
pgg45        6.663911
> rf.pros.test <- predict(rf.pros.2, newdata = pros.test)
> #plot(rf.pros.test, pros.test$lpsa)
> rf.resid <- rf.pros.test - pros.test$lpsa #calculate residual
> mean(rf.resid^2)
[1] 0.5512549

4.随机森林分类

乳腺癌数据为例

> set.seed(123)
> rf.biop <- randomForest(class ~ ., data = biop.train)
> rf.biop

Call:
 randomForest(formula = class ~ ., data = biop.train) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 3

        OOB estimate of  error rate: 3.38%
Confusion matrix:
          benign malignant class.error
benign       294         8  0.02649007
malignant      8       164  0.04651163
> plot(rf.biop)


> which.min(rf.biop$err.rate[, 1])
[1] 125
> set.seed(123)
> rf.biop.2 <- randomForest(class ~ ., data = biop.train, ntree = 125)
> #getTree(rf.biop,1)
> rf.biop.2

Call:
 randomForest(formula = class ~ ., data = biop.train, ntree = 125) 
               Type of random forest: classification
                     Number of trees: 125
No. of variables tried at each split: 3

        OOB estimate of  error rate: 2.95%
Confusion matrix:
          benign malignant class.error
benign       294         8  0.02649007
malignant      6       166  0.03488372
> rf.biop.test <- predict(rf.biop.2, 
+                         newdata = biop.test, 
+                         type = "response")
> table(rf.biop.test, biop.test$class)
            
rf.biop.test benign malignant
   benign       138         0
   malignant      4        67
> (138 + 67) / 209
[1] 0.9808612
> varImpPlot(rf.biop.2)

 

印第安人糖尿病数据

data(Pima.tr)
data(Pima.te)
pima <- rbind(Pima.tr, Pima.te)

> set.seed(123)
> ind <- sample(2, nrow(pima), replace = TRUE, prob = c(0.7, 0.3))
> pima.train <- pima[ind == 1, ]
> pima.test <- pima[ind == 2, ]
> set.seed(123)
> rf.pima <- randomForest(type ~ ., data = pima.train)
> rf.pima

Call:
 randomForest(formula = type ~ ., data = pima.train) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 2

        OOB estimate of  error rate: 22.57%
Confusion matrix:
     No Yes class.error
No  226  30   0.1171875
Yes  56  69   0.4480000
> # plot(rf.pima)
> which.min(rf.pima$err.rate[,1])
[1] 244
> rf.pima.2 <- randomForest(type ~ ., data = pima.train, ntree = 244)
> rf.pima.2

Call:
 randomForest(formula = type ~ ., data = pima.train, ntree = 244) 
               Type of random forest: classification
                     Number of trees: 244
No. of variables tried at each split: 2

        OOB estimate of  error rate: 23.62%
Confusion matrix:
     No Yes class.error
No  223  33   0.1289062
Yes  57  68   0.4560000
> rf.pima.test <- predict(rf.pima.2, 
+                         newdata = pima.test, 
+                         type = "response")
> table(rf.pima.test, pima.test$type)
            
rf.pima.test No Yes
         No  85  16
         Yes 14  36
> (85+36)/(85+16+14+36)
[1] 0.8013245

 

  • 0
    点赞
  • 0
    评论
  • 25
    收藏
  • 一键三连
    一键三连
  • 扫一扫,分享海报

©️2021 CSDN 皮肤主题: 撸撸猫 设计师:马嘣嘣 返回首页
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、C币套餐、付费专栏及课程。

余额充值