# 6.R语言 分类回归树--决策树、随机森林

R语言 专栏收录该内容
79 篇文章 6 订阅

## 关注CSDN博客：程志伟的博客

1.回归树

# Regression tree on the prostate data set: predict log PSA (lpsa).
data(prostate)
# Recode Gleason score as binary: score 6 -> 0, anything else -> 1.
prostate$gleason <- ifelse(prostate$gleason == 6, 0, 1)
# Split on the predefined `train` flag; keep only columns 1:9
# (the eight predictors plus the response lpsa).
pros.train <- subset(prostate, train == TRUE)[, 1:9]
pros.test <- subset(prostate, train == FALSE)[, 1:9]

set.seed(123)  # reproducible tree growing / cross-validation
tree.pros <- rpart(lpsa ~ ., data = pros.train)
# Inspect the complexity-parameter table (CP, nsplit, rel/x-val error).
tree.pros$cptable
CP nsplit rel error    xerror      xstd
1 0.35852251      0 1.0000000 1.0364016 0.1822698
2 0.12295687      1 0.6414775 0.8395071 0.1214181
3 0.11639953      2 0.5185206 0.7255295 0.1015424
4 0.05350873      3 0.4021211 0.7608289 0.1109777
5 0.01032838      4 0.3486124 0.6911426 0.1061507
6 0.01000000      5 0.3382840 0.7102030 0.1093327

CP表的第一列是成本复杂性参数（CP），第二列是树的分裂次数（nsplit），第三列是相对误差（rel error），第四列是交叉验证平均误差（xerror），第五列是交叉验证误差的标准差（xstd）。

plotcp(tree.pros)查看统计图，可以看出在第4次分裂时数据的误差是最小的。

# Prune at the CP of row 5 (4 splits), the row with the lowest
# cross-validated error (xerror) in the cptable above.
cp <- min(tree.pros$cptable[5, ])
prune.tree.pros <- prune(tree.pros, cp = cp)
# Visualize the pruned tree.
plot(as.party(prune.tree.pros))
# Predict on the held-out test set and compute the mean squared error.
party.pros.test <- predict(prune.tree.pros, newdata = pros.test)
rpart.resid <- party.pros.test - pros.test$lpsa  # calculate residuals
mean(rpart.resid^2)

[1] 0.5267748

2.分类树

> data(biopsy)
> biopsy <- biopsy[, -1]
> names(biopsy) <- c("thick", "u.size", "u.shape", "adhsn", "s.size", "nucl", "chrom", "n.nuc", "mit", "class")
> biopsy.v2 <- na.omit(biopsy)
> set.seed(123) #random number generator
> ind <- sample(2, nrow(biopsy.v2), replace = TRUE, prob = c(0.7, 0.3))
> biop.train <- biopsy.v2[ind == 1, ] #the training data set
> biop.test <- biopsy.v2[ind == 2, ] #the test data set
> str(biop.test)
'data.frame':    209 obs. of  10 variables:
 $ thick  : int  5 6 4 2 1 7 6 7 1 3 ...
 $ u.size : int  4 8 1 1 1 4 1 3 1 2 ...
 $ u.shape: int  4 8 1 2 1 6 1 2 1 1 ...
 $ adhsn  : int  5 1 3 1 1 4 1 10 1 1 ...
 $ s.size : int  7 3 2 2 1 6 2 5 2 1 ...
 $ nucl   : int  10 4 1 1 1 1 1 10 1 1 ...
 $ chrom  : int  3 3 3 3 3 4 3 5 3 2 ...
 $ n.nuc  : int  2 7 1 1 1 3 1 4 1 1 ...
 $ mit    : int  1 1 1 1 1 1 1 4 1 1 ...
 $ class  : Factor w/ 2 levels "benign","malignant": 1 1 1 1 1 2 1 2 1 1 ...
- attr(*, "na.action")= 'omit' Named int  24 41 140 146 159 165 236 250 276 293 ...
..- attr(*, "names")= chr  "24" "41" "140" "146" ...
> set.seed(123)
> tree.biop <- rpart(class ~ ., data = biop.train)
> tree.biop$cptable
          CP nsplit rel error    xerror       xstd
1 0.79651163      0 1.0000000 1.0000000 0.06086254
2 0.07558140      1 0.2034884 0.2674419 0.03746996
3 0.01162791      2 0.1279070 0.1453488 0.02829278
4 0.01000000      3 0.1162791 0.1744186 0.03082013
> cp <- min(tree.biop$cptable[3, ])
> prune.tree.biop = prune(tree.biop, cp = cp)
> # plot(as.party(tree.biop))
> plot(as.party(prune.tree.biop))
> rparty.test <- predict(prune.tree.biop, newdata = biop.test,
+                        type = "class")
> table(rparty.test, biop.test$class)

rparty.test benign malignant
  benign       136         3
  malignant      6        64
> (136 + 64) / 209
[1] 0.9569378

3.随机森林回归

> set.seed(123)
> rf.pros <- randomForest(lpsa ~ ., data = pros.train)
> rf.pros

Call:
 randomForest(formula = lpsa ~ ., data = pros.train)
               Type of random forest: regression
                     Number of trees: 500
No. of variables tried at each split: 2

          Mean of squared residuals: 0.6936697
                    % Var explained: 51.73

随机森林生成了500个树，每次分裂时随机抽取两个变量。MSE为0.69，差不多52%的方差得到解释。

> plot(rf.pros)
> which.min(rf.pros$mse)
[1] 80
> set.seed(123)
> rf.pros.2 <- randomForest(lpsa ~ ., data = pros.train, ntree = 80)
> rf.pros.2

Call:
randomForest(formula = lpsa ~ ., data = pros.train, ntree = 80)
Type of random forest: regression
Number of trees: 80
No. of variables tried at each split: 2

Mean of squared residuals: 0.6566502
% Var explained: 54.31
> varImpPlot(rf.pros.2, scale = TRUE,
+            main = "Variable Importance Plot - PSA Score")

> importance(rf.pros.2)
IncNodePurity
lcavol      25.011557
lweight     15.822110
age          7.167320
lbph         5.471032
svi          8.497838
lcp          8.113947
gleason      4.990213
pgg45        6.663911
> rf.pros.test <- predict(rf.pros.2, newdata = pros.test)
> # plot(rf.pros.test, pros.test$lpsa)
> rf.resid <- rf.pros.test - pros.test$lpsa # calculate residual
> mean(rf.resid^2)
[1] 0.5512549

4.随机森林分类

> set.seed(123)
> rf.biop <- randomForest(class ~ ., data = biop.train)
> rf.biop

Call:
randomForest(formula = class ~ ., data = biop.train)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 3

OOB estimate of  error rate: 3.38%
Confusion matrix:
benign malignant class.error
benign       294         8  0.02649007
malignant      8       164  0.04651163
> plot(rf.biop)

> which.min(rf.biop$err.rate[, 1])
[1] 125
> set.seed(123)
> rf.biop.2 <- randomForest(class ~ ., data = biop.train, ntree = 125)
> # getTree(rf.biop, 1)
> rf.biop.2

Call:
 randomForest(formula = class ~ ., data = biop.train, ntree = 125)
               Type of random forest: classification
                     Number of trees: 125
No. of variables tried at each split: 3

        OOB estimate of  error rate: 2.95%
Confusion matrix:
          benign malignant class.error
benign       294         8  0.02649007
malignant      6       166  0.03488372
> rf.biop.test <- predict(rf.biop.2,
+                         newdata = biop.test,
+                         type = "response")
> table(rf.biop.test, biop.test$class)

rf.biop.test benign malignant
benign       138         0
malignant      4        67
> (138 + 67) / 209
[1] 0.9808612
> varImpPlot(rf.biop.2)

# Load the Pima Indians diabetes data (pre-split training and test
# halves from MASS) and combine them into one data frame, so we can
# make our own random train/test split below.
data(Pima.tr)
data(Pima.te)
pima <- rbind(Pima.tr, Pima.te)

> set.seed(123)
> ind <- sample(2, nrow(pima), replace = TRUE, prob = c(0.7, 0.3))
> pima.train <- pima[ind == 1, ]
> pima.test <- pima[ind == 2, ]
> set.seed(123)
> rf.pima <- randomForest(type ~ ., data = pima.train)
> rf.pima

Call:
randomForest(formula = type ~ ., data = pima.train)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 2

OOB estimate of  error rate: 22.57%
Confusion matrix:
No Yes class.error
No  226  30   0.1171875
Yes  56  69   0.4480000
> # plot(rf.pima)
> which.min(rf.pima$err.rate[, 1])
[1] 244
> rf.pima.2 <- randomForest(type ~ ., data = pima.train, ntree = 244)
> rf.pima.2

Call:
 randomForest(formula = type ~ ., data = pima.train, ntree = 244)
               Type of random forest: classification
                     Number of trees: 244
No. of variables tried at each split: 2

        OOB estimate of  error rate: 23.62%
Confusion matrix:
     No Yes class.error
No  223  33   0.1289062
Yes  57  68   0.4560000
> rf.pima.test <- predict(rf.pima.2,
+                         newdata = pima.test,
+                         type = "response")
> table(rf.pima.test, pima.test$type)

rf.pima.test No Yes
No  85  16
Yes 14  36
> (85+36)/(85+16+14+36)
[1] 0.8013245

• 0
点赞
• 0
评论
• 25
收藏
• 一键三连
• 扫一扫，分享海报

02-17
04-07 7959

02-19 346
03-03 3000
12-27 3150
12-26 1007
11-04 1万+
07-26 4148
02-27 2413
06-04 7907
05-20 5572
03-02
06-10 4324
01-26 352
11-29 3110

1.余额是钱包充值的虚拟货币，按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载，可以购买VIP、C币套餐、付费专栏及课程。