# TEST 2020-21
# Question 1
# Question 2
# Libraries used throughout this script:
library(nnet)    # single-hidden-layer neural networks
library(gbm)     # gradient boosting machines
library(glmnet)  # ridge / lasso regression
library(caret)   # train(), trainControl(), confusionMatrix(), varImp()
library(mlbench) # BreastCancer data (Question 3)
dat = read.csv(file = 'dodgysales.csv', stringsAsFactors = TRUE)
n = nrow(dat)
set.seed(6041)
i.train = sample(1:n, floor(.7*n))
dat.train = dat[i.train,]
dat.validation = dat[-i.train,]
#(a)
# Is this a regression or a classification problem?
# regression
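# (Quick check: the response Sales, as used throughout below, is numeric
# rather than a factor, which confirms a regression problem.)
is.numeric(dat$Sales)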
#(b)
# Quote the number P of predictors present in this dataset.
ncol(dat)-1
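# (For reference, the predictors are all columns other than the response:)
names(dat)[names(dat) != "Sales"]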
# (c)
# Create a scaled copy dat.s of dataset dat, using min-max normalisation
# (apply this scaling to the response variable also).
# (i)
# Quote the 5-number summaries of dat.s$Sales, dat.s$BudgOp and
# provide the frequency distribution table for dat.s$Training
minmax.scale <- function(x){
  # scale numeric columns to [0,1]; leave factor columns unchanged
  if(!is.factor(x)){
    xs = (x - min(x)) / (max(x) - min(x))
  } else {
    xs = x
  }
  return(xs)
}
dat.s=as.data.frame(lapply(dat,minmax.scale))
summary(dat.s$Sales)
summary(dat.s$BudgOp)
summary(dat.s$Training)
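# summary() on a factor already returns counts; table() states the frequency
# distribution of dat.s$Training more explicitly:
table(dat.s$Training)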
# (ii)
# Split the scaled data dat.s into training and validation subsets.
dat.s.train = dat.s[i.train,]
dat.s.validation = dat.s[-i.train,]
# Fit two single-layer feed-forward neural networks, using respectively 3
# and 8 neurons in the hidden layer. Use the nnet library to do this and
# set the random seed to 6041 (set.seed(6041)) before performing any
# model fit. Quote the corresponding training Mean Squared Errors (MSEs).
set.seed(6041)
nn3=nnet(Sales~.,data = dat.s.train,size=3)
set.seed(6041)
nn8=nnet(Sales~.,data = dat.s.train,size=8)
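# (NB: nnet uses a logistic output unit by default, which is acceptable here
# only because the response was min-max scaled into [0,1]; for an unscaled
# numeric response one would pass linout=TRUE.)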
mean(nn3$residuals^2)
mean(nn8$residuals^2)
#(iii)
# Generate predictions for the validation set dat.s.validation from
# each of the neural networks trained in (ii). Quote the corresponding
# validation MSEs.
# predict for validation data
y.test=dat.s.validation$Sales
p3= predict(nn3,dat.s.validation)
p8= predict(nn8,dat.s.validation)
mean((p3-y.test)^2)
mean((p8-y.test)^2)
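# A compact side-by-side view of training vs validation MSEs (a convenience
# sketch using the quantities computed above):
rbind(training   = c(nn3 = mean(nn3$residuals^2), nn8 = mean(nn8$residuals^2)),
      validation = c(nn3 = mean((p3 - y.test)^2), nn8 = mean((p8 - y.test)^2)))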
# (iv)
# Suggest an explanation for the difference between the training and
# validation errors for each of these neural networks.
# The training MSEs are computed on the same data used to fit the models, so
# they underestimate the generalisation error; the validation MSEs are larger,
# as expected. The gap tends to be wider for the 8-neuron network, which has
# more weights and hence more capacity to overfit the training data. Note also
# that this is a single random split, so the comparison is subject to sampling
# variability and remains limited.
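# One way to see the difference in complexity is to count the fitted weights
# of each network (nnet stores them in $wts):
length(nn3$wts)
length(nn8$wts)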
# (d)
# Set random seed to 6041 (set.seed(6041)) and fit a gradient boosting model
# to the training data dat.train, using package gbm, and using 100 weak
# learners for this ensemble. Quote the corresponding
# training and validation MSEs.
set.seed(6041)
gbmo = gbm(Sales~.,data = dat.train,distribution = 'gaussian',
n.trees = 100)
gbmp = predict(gbmo, dat.validation, n.trees = 100)
mean((gbmo$fit-dat.train$Sales)^2)
mean((gbmp-dat.validation$Sales)^2)
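# As a by-product, gbm can report the relative influence of each predictor
# (plotit=FALSE just returns the table):
summary(gbmo, plotit = FALSE)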
# (e)
# Set random seed to 6041 (set.seed(6041)) and
#fit a generalized linear regression model to the training data dat.train.
# Quote the corresponding training and validation MSEs.
set.seed(6041)
glmo = glm(Sales~.,data = dat.train,family = 'gaussian')
glmp= predict(glmo,dat.validation)
mean((fitted(glmo)-dat.train$Sales)^2)
mean((glmp-dat.validation$Sales)^2)
# (f)
# Set random seed to 6041 (set.seed(6041)) and
# fit a ridge regression model to the training data dat.train.
# Quote the corresponding training and validation MSEs.
set.seed(6041)
xm= model.matrix(Sales~.+0,data = dat.train)
y = dat.train$Sales
# cross-validate the ridge penalty (note alpha=0 here too, so that lambda is
# tuned for ridge rather than for the default lasso)
ridge.cv = cv.glmnet(xm, y, family="gaussian", alpha=0)
ridge = glmnet(xm, y, alpha=0, lambda=ridge.cv$lambda.min)
ridge.fit= predict(ridge,xm)
xmv= model.matrix(Sales~.+0,data = dat.validation)
ridgep= predict(ridge,xmv)
mean((ridge.fit-dat.train$Sales)^2)
mean((ridgep-dat.validation$Sales)^2)
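# The coefficients at the CV-selected lambda can be inspected to see the
# shrinkage (ridge shrinks coefficients towards zero without zeroing them):
coef(ridge)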
# (g) NB: the networks were fit on the scaled data, so their predictions need
# to be rescaled before comparing errors on the original Sales scale.
# Compare and comment on the validation errors obtained from the
# neural networks and from ridge regression.
set.seed(6041)
nnC = trainControl(method="cv", number=10)
nno = train(Sales~.,data = dat.s.train, method="nnet",
trControl=nnC)
names(nno)
nno$bestTune
nno.fit = predict(nno,dat.s.train)
nno.res = nno.fit-dat.s.train$Sales
mean(nno.res^2)
po=predict(nno,dat.s.validation)
mean((po-y.test)^2)
# Back-transform the scaled predictions to the original Sales scale so that
# the validation MSE is comparable with models fitted on the unscaled data:
pov=po*(max(dat$Sales)-min(dat$Sales))+min(dat$Sales)
mean((pov-dat.validation$Sales)^2)
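# Collecting the validation MSEs on the original Sales scale for a direct
# comparison (a convenience sketch using objects computed above):
c(nnet.caret = mean((pov - dat.validation$Sales)^2),
  ridge      = mean((ridgep - dat.validation$Sales)^2),
  gbm        = mean((gbmp - dat.validation$Sales)^2),
  glm        = mean((glmp - dat.validation$Sales)^2))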
# Question 3
# Wisconsin Breast Cancer Database
data(BreastCancer)
dim(BreastCancer)
# set up the data
dat=na.omit(BreastCancer)
dat$Id=NULL
n=nrow(dat)
set.seed(4061) # seed the split too, so it is reproducible
i.train=sample(1:n,600,replace=FALSE)
dat.train=dat[i.train,]
dat.validation=dat[-i.train,]
# (a)
# Set random seed to 4061 (set.seed(4061)) and fit a random forest model to
# the training set, performing a simple 10-fold cross-validation for training.
# Obtain predictions from this model for the validation set dat.validation.
# (i) Quote the number of variables used at each split.
# (ii) Provide the test set prediction accuracy achieved with this model.
# random forest
set.seed(4061)
rf.Control=trainControl(method='cv',number=10)
rf.out=caret::train(Class~.,trControl=rf.Control,data=dat.train,method='rf')
rf.pred=predict(rf.out,dat.validation)
rf.cm=confusionMatrix(reference=dat.validation$Class,data=rf.pred)
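# (i) the number of variables used at each split is the mtry value retained
# by caret:
rf.out$bestTune
# (ii) validation ("test set") prediction accuracy:
rf.cm$overall["Accuracy"]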
# SVM(linear)
set.seed(4061)
svm.Control=trainControl(method='cv',number=10)
svm.out=caret::train(Class~.,trControl=svm.Control,data=dat.train,method='svmLinear')
svm.pred=predict(svm.out,dat.validation)
svm.cm=confusionMatrix(reference=dat.validation$Class,data=svm.pred)
# SVM(radial)
set.seed(4061)
svmR.Control=trainControl(method='cv',number=10)
svmR.out=caret::train(Class~.,trControl=svmR.Control,data=dat.train,method='svmRadial')
svmR.pred=predict(svmR.out,dat.validation)
svmR.cm=confusionMatrix(reference=dat.validation$Class,data=svmR.pred)
# which model is deemed better?
round(cbind(rf.cm$overall,svm.cm$overall,svmR.cm$overall),3)
# None, really: the accuracies are comparable and their CIs overlap.
# variable importance
varImp(rf.out)
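# A dotplot of the same importance scores is often easier to read:
plot(varImp(rf.out))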