# TEST 2020-21 (reworked version)

# Question 1

# Question 2

# libraries used throughout
library(nnet)     # single-layer neural networks
library(gbm)      # gradient boosting
library(glmnet)   # ridge/lasso regression
library(caret)    # train(), trainControl(), confusionMatrix()
library(mlbench)  # BreastCancer dataset (Question 3)

dat = read.csv(file='dodgysales.csv', stringsAsFactors=TRUE)

n = nrow(dat)

set.seed(6041)

i.train = sample(1:n, floor(.7*n))

dat.train = dat[i.train,]

dat.validation = dat[-i.train,]

#(a)

# Is this a regression or a classification problem?

# Regression: the response (Sales) is numeric/continuous.
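# Quick sanity check (an addition; assumes Sales is read in as numeric):
is.numeric(dat$Sales)   # TRUE -> continuous response, hence regression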

#(b)

# Quote the number P of predictors present in this dataset.

ncol(dat)-1
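# For reference, the predictor names (everything except the response Sales):
setdiff(names(dat), "Sales")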

# (c)

# Create a scaled copy dat.s of dataset dat, using min-max normalisation
# (apply this scaling to the response variable also).

# (i)

# Quote the 5-number summaries of dat.s$Sales, dat.s$BudgOp and

# provide the frequency distribution table for dat.s$Training

minmax.scale <- function(x){
  # scale numeric variables to [0,1]; leave factors unchanged
  if(!is.factor(x)){
    xs = (x - min(x)) / (max(x) - min(x))
  } else {
    xs = x
  }
  return(xs)
}

dat.s=as.data.frame(lapply(dat,minmax.scale))
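# Sanity check (an addition): every numeric column of dat.s should now lie in [0,1]
sapply(dat.s[sapply(dat.s, is.numeric)], range)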

summary(dat.s$Sales)

summary(dat.s$BudgOp)

summary(dat.s$Training)
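# summary() on a factor already returns counts; table() gives the requested
# frequency distribution explicitly:
table(dat.s$Training)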

# (ii)
# Split the scaled data dat.s into training and validation subsets.
# Fit two single-layer feed-forward neural networks, using respectively 3
# and 8 neurons in the hidden layer. Use the nnet library to do this and
# set the random seed to 6041 (set.seed(6041)) before performing any
# model fit. Quote the corresponding training Mean Squared Errors (MSEs).

dat.s.train = dat.s[i.train,]
dat.s.validation = dat.s[-i.train,]

set.seed(6041)

nn3=nnet(Sales~.,data = dat.s.train,size=3)

set.seed(6041)

nn8=nnet(Sales~.,data = dat.s.train,size=8)

# training MSEs, computed from the fitted values via the residuals
mean(nn3$residuals^2)
mean(nn8$residuals^2)
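# Context for comparing the training errors (an addition): the 8-neuron network
# has far more free parameters than the 3-neuron one.
length(nn3$wts)   # number of weights, 3 hidden neurons
length(nn8$wts)   # number of weights, 8 hidden neurons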

#(iii)

# Generate predictions for the validation set dat.s.validation from

# each of the neural networks trained in (ii). Quote the corresponding  

# validation MSEs.

# predict for validation data

y.test=dat.s.validation$Sales

p3= predict(nn3,dat.s.validation)

p8= predict(nn8,dat.s.validation)

mean((p3-y.test)^2)

mean((p8-y.test)^2)

# (iv)

# Suggest an explanation for the difference between the training and

# validation errors for each of these neural networks.

# We only carried out this train/validation split once, so we have no idea of
# the sampling distribution of these errors; the comparison is therefore very
# limited. A rough way to assess this is sketched below.
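# A minimal sketch (not part of the original answer): repeating the
# split-and-fit a few times gives a feel for the spread of the validation MSE
# of, say, the 3-neuron network.
R = 20
mse.val = numeric(R)
for(r in 1:R){
  i.r = sample(1:n, floor(.7*n))                       # fresh random split
  fit.r = nnet(Sales~., data=dat.s[i.r,], size=3, trace=FALSE)
  p.r = predict(fit.r, dat.s[-i.r,])
  mse.val[r] = mean((p.r - dat.s$Sales[-i.r])^2)       # validation MSE
}
summary(mse.val)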

# (d)

# Set random seed to 6041 (set.seed(6041)) and fit a gradient boosting model

# to the training data dat.train, using package gbm, and using 100 weak

# learners for this ensemble. Quote the corresponding

# training and validation MSEs.

set.seed(6041)

gbmo = gbm(Sales~., data=dat.train, distribution='gaussian', n.trees=100)

# pass n.trees explicitly so predict.gbm does not have to guess
gbmp = predict(gbmo, dat.validation, n.trees=100)

mean((gbmo$fit - dat.train$Sales)^2)          # training MSE
mean((gbmp - dat.validation$Sales)^2)         # validation MSE
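# Optional check (an addition; assumes gbm's default bag.fraction of 0.5, so
# OOB estimates are available): does 100 trees look like enough?
gbm.perf(gbmo, method="OOB")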

# (e)   

# Set random seed to 6041 (set.seed(6041)) and

#fit a generalized linear regression model to the training data dat.train.

# Quote the corresponding training and validation MSEs.

set.seed(6041)

glmo = glm(Sales~.,data = dat.train,family = 'gaussian')

glmp= predict(glmo,dat.validation)

mean((fitted(glmo) - dat.train$Sales)^2)      # training MSE
mean((glmp - dat.validation$Sales)^2)         # validation MSE

# (f)

# Set random seed to 6041 (set.seed(6041)) and

# fit a ridge regression model to the training data dat.train.

# Quote the corresponding training and validation MSEs.

set.seed(6041)

xm= model.matrix(Sales~.+0,data = dat.train)

y = dat.train$Sales

# cross-validate with alpha=0 so the penalty is tuned for ridge, not lasso
ridge.cv = cv.glmnet(xm, y, alpha=0, family="gaussian")

ridge = glmnet(xm, y, alpha=0, lambda=ridge.cv$lambda.min)

ridge.fit= predict(ridge,xm)

xmv= model.matrix(Sales~.+0,data = dat.validation)

ridgep= predict(ridge,xmv)

mean((ridge.fit - dat.train$Sales)^2)         # training MSE
mean((ridgep - dat.validation$Sales)^2)       # validation MSE
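# For reference (an addition), the CV-selected penalty used above:
ridge.cv$lambda.min
plot(ridge.cv)   # CV error curve as a function of log(lambda)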

# (g) NB: you need to rescale the data (the network's predictions are on the min-max scale).

# Compare and comment on the validation errors obtained from the

# neural networks and from ridge regression.

set.seed(6041)

nnC = trainControl(method="cv", number=10)

nno = train(Sales~., data=dat.s.train, method="nnet", trControl=nnC)

names(nno)

nno$bestTune

nno.fit = predict(nno,dat.s.train)

nno.res = nno.fit-dat.s.train$Sales

mean(nno.res^2)

po=predict(nno,dat.s.validation)

mean((po-y.test)^2)

# The predictions po are on the min-max scale; map them back to the original
# Sales scale so the validation MSE is comparable with the unscaled models:

pov=po*(max(dat$Sales)-min(dat$Sales))+min(dat$Sales)

mean((pov-dat.validation$Sales)^2)
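# Gathering the validation MSEs on the original Sales scale for the comparison
# requested in (g) (an addition; labels are illustrative):
c(nnet.tuned = mean((pov - dat.validation$Sales)^2),
  ridge      = mean((ridgep - dat.validation$Sales)^2))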

# Question 3

# Wisconsin Breast Cancer Database

data(BreastCancer)

dim(BreastCancer)

# set up the data

dat=na.omit(BreastCancer)

dat$Id=NULL

n=nrow(dat)

i.train=sample(1:n,600,replace=F)

dat.train=dat[i.train,]

dat.validation=dat[-i.train,]

# (a)

# Set random seed to 4061 (set.seed(4061)) and fit a random forest model to

# the training set, performing a simple 10-fold cross-validation for training.

# Obtain predictions from this model for the validation set dat.validation.

# (i) Quote the number of variables used at each split.

# (ii) Provide the test set prediction accuracy achieved with this model.

# random forest

set.seed(4061)

rf.Control=trainControl(method='cv',number=10)

rf.out=caret::train(Class~.,trControl=rf.Control,data=dat.train,method='rf')

rf.pred=predict(rf.out,dat.validation)

rf.cm=confusionMatrix(reference=dat.validation$Class,data=rf.pred)
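# Quoting the quantities asked for (an addition):
# (i) number of variables used at each split (mtry selected by CV):
rf.out$bestTune
# (ii) validation (test set) prediction accuracy:
rf.cm$overall["Accuracy"]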

# SVM(linear)

set.seed(4061)

svm.Control=trainControl(method='cv',number=10)

svm.out=caret::train(Class~.,trControl=svm.Control,data=dat.train,method='svmLinear')

svm.pred=predict(svm.out,dat.validation)

svm.cm=confusionMatrix(reference=dat.validation$Class,data=svm.pred)

# SVM(radial)

set.seed(4061)

svmR.Control=trainControl(method='cv',number=10)

svmR.out=caret::train(Class~.,trControl=svmR.Control,data=dat.train,method='svmRadial')

svmR.pred=predict(svmR.out,dat.validation)

svmR.cm=confusionMatrix(reference=dat.validation$Class,data=svmR.pred)

# which model is deemed better?

round(cbind(rf.cm$overall,svm.cm$overall,svmR.cm$overall),3)

# None, really, as the CIs around the accuracies are comparable (see below).
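# The CIs backing that statement (fields from caret::confusionMatrix):
rbind(rf   = rf.cm$overall[c("AccuracyLower","AccuracyUpper")],
      svm  = svm.cm$overall[c("AccuracyLower","AccuracyUpper")],
      svmR = svmR.cm$overall[c("AccuracyLower","AccuracyUpper")])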

# variable importance

varImp(rf.out)
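# Optional (an addition): visual ranking of variable importance
plot(varImp(rf.out))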
