# NOTE(review): rm(list = ls()) wipes the global environment; this is
# discouraged in scripts (restart the R session instead). Kept to preserve
# the script's original behaviour.
rm(list = ls())

# Modelling / ML packages used throughout the practical.
library(Matrix)
library(glmnet)        # penalised regression
library(survival)
library(ISLR)          # data sets (was loaded twice; duplicate removed)
library(MASS)
library(class)
library(caret)
library(gbm)
library(nnet)
library(pROC)
library(randomForest)
library(tree)          # classification / regression trees
# Question 1 ----
rm(list = ls())

M <- 100
set.seed(4061)

# Shuffle the iris rows, then standardise the four numeric predictors to
# zero mean / unit variance; column 5 is the Species factor (left as is).
dat <- iris[sample(seq_len(nrow(iris))), ]
dat[, 1:4] <- scale(dat[, 1:4])

# Random training index of size M drawn from the 150 observations.
itrain <- sample(seq_len(nrow(iris)), M)
# (a) Grow a classification tree on the training set using the Gini index
#     as splitting criterion; quote the number of terminal nodes and the
#     misclassification error rate of the full (unpruned) tree.
class(dat)
tree.mod <- tree(Species ~ ., data = dat[itrain, ], split = 'gini')

# Compute the summary once (the original called summary() twice) and reuse it.
tree.sum <- summary(tree.mod)
tree.sum$used   # variables actually used in the splits
tree.sum        # tree size (terminal nodes) and misclassification rate
# (b) Consider the classification tree obtained in Figure 1 for a similar
#     training set. Which variables were found to be useful for the
#     classification of iris specimens, based on this tree?
par(font = 2, mar = c(1, 1, 1, 1))
plot(tree.mod, col = 'navy')
text(tree.mod, pretty = NULL)
# (c) One boxplot per numeric feature, split by species, to see which
#     measurements separate the three species best.
par(mfrow = c(2, 2))
for (k in seq_len(4)) {
  # paste() on a single string was a no-op; use the column name directly.
  boxplot(dat[, k] ~ dat[, 5], col = 'pink', main = names(dat)[k])
}
# (d) Based on these boxplots, comment on your findings in (b):
#     the Petal information is more clearly separated per species.

# (e) Compute predictions for the test set based on the tree grown in (a),
#     and provide:
#     (i)  the corresponding confusion table;
#     (ii) the prediction error rate.
tree.pred <- predict(tree.mod, dat[-itrain, ], type = 'class')
tb.tree <- table(tree.pred, dat[-itrain, 5])
tb.tree                                  # (i) confusion table (was computed but never shown)
1 - sum(diag(tb.tree)) / sum(tb.tree)    # (ii) test error rate
# (f)
# we now consider pruning the classification tree obtained in (a),
# based on cross-validated misclassification error.
# what is the optimal tree size for pruning? explain your answer.