rm(list=ls())  # NOTE(review): rm(list = ls()) in scripts is discouraged — it is not a fresh session and clobbers the caller's workspace
library(Matrix)
library(glmnet)        # penalized regression: glmnet()/cv.glmnet()
library(survival)
library(ISLR)
library(MASS)
library(class)
library(caret)         # train(), trainControl(), confusionMatrix(), varImp()
library(gbm)           # gradient boosting: gbm()
library(ISLR)          # NOTE(review): duplicate of the library(ISLR) above — harmless but redundant
library(nnet)          # single-hidden-layer neural networks: nnet()
library(pROC)
library(randomForest)
#2
# Load the sales data; keep character columns as factors for modelling.
dat <- read.csv(file = 'dodgysales.csv', stringsAsFactors = TRUE)
n <- nrow(dat)
# Reproducible 70/30 train/validation split on the raw data.
set.seed(6041)
i.train <- sample(seq_len(n), floor(.7 * n))
dat.train <- dat[i.train, ]
dat.validation <- dat[-i.train, ]
#(1) this is a regression problem: the response Sales is numeric
dat$Sales
#(2) number of predictors = every column except the response
ncol(dat) - 1
#(3)
#(i)
# Min-max scale a numeric vector to [0, 1]; factors pass through unchanged
# so the function can be lapply()'d over every column of a data frame.
#
# Args:
#   x:     a numeric vector or a factor.
#   na.rm: drop NAs when computing the range (default TRUE). Previously a
#          single NA made the whole scaled column NA.
# Returns: x rescaled to [0, 1] (NAs stay NA), or x itself if a factor.
minmax.scale <- function(x, na.rm = TRUE) {
  if (is.factor(x)) {
    return(x)
  }
  rng <- range(x, na.rm = na.rm)
  span <- rng[2] - rng[1]
  # Guard against constant columns: avoid 0/0 = NaN, return all zeros.
  if (span == 0) {
    return(x - rng[1])
  }
  (x - rng[1]) / span
}
# Scale every numeric column to [0, 1]; factor columns pass through.
dat.s <- as.data.frame(lapply(dat, minmax.scale))
summary(dat.s$Sales)
summary(dat.s$BudgOp)
summary(dat.s$Training)
# Reuse the same split indices on the scaled data.
dat.s.train <- dat.s[i.train, ]
dat.s.validation <- dat.s[-i.train, ]
# Single-hidden-layer networks with 3 and 8 hidden units.
# NOTE(review): nnet() defaults to a logistic output unit; that only works
# for this regression because Sales was scaled into [0, 1] — confirm that
# linout = TRUE was not intended.
set.seed(6041)
nn3 <- nnet(Sales ~ ., data = dat.s.train, size = 3)
set.seed(6041)
nn8 <- nnet(Sales ~ ., data = dat.s.train, size = 8)
#(ii) training MSE from the fitted values
mean(nn3$residuals^2)
mean(nn8$residuals^2)
# Predictions on the validation set
y.test <- dat.s.validation$Sales
p3 <- predict(nn3, dat.s.validation)
p8 <- predict(nn8, dat.s.validation)
#(iii) validation MSE
mean((p3 - y.test)^2)
mean((p8 - y.test)^2)
#(iv) caret-tuned neural network via 10-fold cross-validation.
# FIX: removed the trailing comma in trainControl(method="cv", number=10,)
# which passed an empty positional argument into the call.
set.seed(6041)
nnC <- trainControl(method = "cv", number = 10)
nno <- train(Sales ~ ., data = dat.s.train, method = "nnet",
             trControl = nnC)
names(nno)
nno$bestTune
# Training MSE of the tuned model
nno.fit <- predict(nno, dat.s.train)
nno.res <- nno.fit - dat.s.train$Sales
mean(nno.res^2)
# Validation MSE (still on the min-max scale)
po <- predict(nno, dat.s.validation)
mean((po - y.test)^2)
#D GBM
# Gradient boosting on the ORIGINAL (unscaled) data.
set.seed(6041)
gbmo <- gbm(Sales ~ ., data = dat.train, distribution = 'gaussian',
            n.trees = 100)
# FIX: predict.gbm() requires n.trees; without it, it guesses a number of
# trees and emits a warning. Use the same 100 trees as the fit.
gbmp <- predict(gbmo, dat.validation, n.trees = 100)
# Training and validation MSE
mean((gbmo$fit - dat.train$Sales)^2)
mean((gbmp - dat.validation$Sales)^2)
#E glm
# Linear model (gaussian family, i.e. ordinary least squares) on raw data.
set.seed(6041)
glmo <- glm(Sales ~ ., data = dat.train, family = 'gaussian')
glmp <- predict(glmo, dat.validation)
# Training MSE from the fitted values ($fit partially matched this anyway)
mean((glmo$fitted.values - dat.train$Sales)^2)
# Validation MSE
mean((glmp - dat.validation$Sales)^2)
#F ridge
set.seed(6041)
# Design matrix without an intercept column (glmnet adds its own).
xm <- model.matrix(Sales ~ . + 0, data = dat.train)
y <- dat.train$Sales
# BUG FIX: cv.glmnet() defaults to alpha = 1 (lasso), but its lambda.min was
# being reused for a ridge fit (alpha = 0). Lasso and ridge lambda paths are
# on different scales, so the CV must also be run with alpha = 0.
ridge.cv <- cv.glmnet(xm, y, family = "gaussian", alpha = 0)
ridge <- glmnet(xm, y, alpha = 0, lambda = ridge.cv$lambda.min)
ridge.fit <- predict(ridge, xm)
xmv <- model.matrix(Sales ~ . + 0, data = dat.validation)
ridgep <- predict(ridge, xmv)
# Training and validation MSE
mean((ridge.fit - dat.train$Sales)^2)
mean((ridgep - dat.validation$Sales)^2)
#g the scaled-data model predicts on the [0, 1] scale, so map predictions
# back to the original Sales units before comparing with the raw response
sales.rng <- range(dat$Sales)
pov <- sales.rng[1] + po * (sales.rng[2] - sales.rng[1])
mean((pov - dat.validation$Sales)^2)
# 2020-2021
# Question 3
# Wisconsin Breast Cancer Database
library(mlbench)
data(BreastCancer)
dim(BreastCancer)
# Set up the data: drop incomplete rows and the Id column (not a predictor).
dat <- na.omit(BreastCancer)
dat$Id <- NULL
n <- nrow(dat)
# BUG FIX: seed the RNG *before* sampling so the train/validation split is
# reproducible (set.seed(4061) previously only appeared after this line).
set.seed(4061)
i.train <- sample(1:n, 600, replace = FALSE)
dat.train <- dat[i.train, ]
dat.validation <- dat[-i.train, ]
# Random forest, tuned by 10-fold cross-validation via caret
set.seed(4061)
rf.Control <- trainControl(method = 'cv', number = 10)
rf.out <- caret::train(Class ~ ., trControl = rf.Control,
                       data = dat.train, method = 'rf')
rf.pred <- predict(rf.out, dat.validation)
rf.cm <- confusionMatrix(reference = dat.validation$Class, data = rf.pred)
# SVM with a linear kernel, same 10-fold CV setup
set.seed(4061)
svm.Control <- trainControl(method = 'cv', number = 10)
svm.out <- caret::train(Class ~ ., trControl = svm.Control,
                        data = dat.train, method = 'svmLinear')
svm.pred <- predict(svm.out, dat.validation)
svm.cm <- confusionMatrix(reference = dat.validation$Class, data = svm.pred)
# SVM with a radial (RBF) kernel, same 10-fold CV setup
set.seed(4061)
svmR.Control <- trainControl(method = 'cv', number = 10)
svmR.out <- caret::train(Class ~ ., trControl = svmR.Control,
                         data = dat.train, method = 'svmRadial')
svmR.pred <- predict(svmR.out, dat.validation)
svmR.cm <- confusionMatrix(reference = dat.validation$Class, data = svmR.pred)
# Which model is deemed better? Compare overall accuracy statistics side
# by side (accuracy, kappa, CI bounds, ...).
round(cbind(rf.cm$overall, svm.cm$overall, svmR.cm$overall), 3)
# None, really, as CI's around accuracies are comparable
# Variable importance from the cross-validated random forest
varImp(rf.out)