# 2020ericfinaltest

rm(list=ls())
library(Matrix)
library(glmnet)
library(survival)
library(ISLR)
library(MASS)
library(class)
library(caret)
library(gbm)
library(ISLR)
library(nnet)
library(pROC)
library(randomForest)


# Question 2: load the sales data and make a 70/30 train/validation split.
dat <- read.csv(file = "dodgysales.csv", stringsAsFactors = TRUE)

n <- nrow(dat)
set.seed(6041)
# Row indices of the training sample (70% of the rows, rounded down).
i.train <- sample(seq_len(n), size = floor(0.7 * n))
dat.train <- dat[i.train, ]
dat.validation <- dat[-i.train, ]

# (1) Sales is the response, so this is a regression problem.
dat$Sales

# (2) Number of predictors: every column except the response.
ncol(dat) - 1

#(3)
#(i)
# Rescale a vector to the [0, 1] interval using min-max scaling.
#
# x: a vector, typically one column of a data frame.
#
# Returns a vector of the same length as x:
#   * factor and character input is returned unchanged (no numeric scale);
#   * empty or all-NA input is returned unchanged;
#   * a constant column maps to 0 instead of the NaN produced by 0 / 0;
#   * otherwise (x - min) / (max - min), with NA entries left as NA and
#     ignored when computing the range.
minmax.scale <- function(x) {
  if (is.factor(x) || is.character(x)) {
    return(x)
  }
  if (all(is.na(x))) {
    return(x)
  }
  rng <- range(x, na.rm = TRUE)
  span <- rng[2] - rng[1]
  if (isTRUE(span == 0)) {
    # Multiplying by zero keeps NA positions and attributes intact.
    return(x * 0)
  }
  (x - rng[1]) / span
}

# Apply minmax.scale() column by column to the full data set, then inspect
# the rescaled response and two of the predictors.
dat.s <- as.data.frame(lapply(dat, minmax.scale))
summary(dat.s$Sales)
summary(dat.s$BudgOp)
summary(dat.s$Training)


# Split the scaled data with the same row indices as the unscaled split.
dat.s.train <- dat.s[i.train, ]
dat.s.validation <- dat.s[-i.train, ]

# Fit two single-hidden-layer networks (3 and 8 hidden units) on the scaled
# training data; the seed is reset so both fits start from the same RNG state.
set.seed(6041)
nn3 <- nnet(Sales ~ ., data = dat.s.train, size = 3)
set.seed(6041)
nn8 <- nnet(Sales ~ ., data = dat.s.train, size = 8)

# (ii) Training MSE from the fitted residuals.
mean(nn3$residuals^2)
mean(nn8$residuals^2)

# Predictions on the scaled validation data.
y.test <- dat.s.validation$Sales
p3 <- predict(nn3, dat.s.validation)
p8 <- predict(nn8, dat.s.validation)

# (iii) Validation MSE on the scaled response.
mean((p3 - y.test)^2)
mean((p8 - y.test)^2)

# (iv) Tune a neural network with caret using 10-fold cross-validation.
set.seed(6041)
nnC <- trainControl(method = "cv", number = 10)
nno <- train(
  Sales ~ .,
  data = dat.s.train,
  method = "nnet",
  trControl = nnC
)
names(nno)
nno$bestTune

# Training MSE of the tuned model (scaled response).
nno.fit <- predict(nno, dat.s.train)
nno.res <- nno.fit - dat.s.train$Sales
mean(nno.res^2)

# Validation MSE of the tuned model (scaled response).
po <- predict(nno, dat.s.validation)
mean((po - y.test)^2)

# D: gradient boosting (100 trees, squared-error loss) on the unscaled data.
set.seed(6041)
gbmo <- gbm(
  Sales ~ .,
  data = dat.train,
  distribution = "gaussian",
  n.trees = 100
)
gbmp <- predict(gbmo, dat.validation)

# Training MSE, then validation MSE, in original Sales units.
mean((gbmo$fit - dat.train$Sales)^2)
mean((gbmp - dat.validation$Sales)^2)

# E: Gaussian GLM (ordinary linear regression) on the unscaled data.
set.seed(6041)
glmo <- glm(Sales ~ ., data = dat.train, family = "gaussian")
glmp <- predict(glmo, dat.validation)

# Training MSE. The component is spelled out in full: the original
# `glmo$fit` only worked through `$` partial matching of `fitted.values`.
mean((glmo$fitted.values - dat.train$Sales)^2)
# Validation MSE.
mean((glmp - dat.validation$Sales)^2)


# F: ridge regression with lambda chosen by cross-validation.
set.seed(6041)
# Predictor matrix for the training data (formula drops the intercept column).
xm <- model.matrix(Sales ~ . + 0, data = dat.train)
y <- dat.train$Sales

# Cross-validate along the ridge path (alpha = 0). Without alpha = 0,
# cv.glmnet() uses its default alpha = 1, so lambda.min would be tuned for
# the lasso penalty and then misapplied to the ridge fit below.
# The object keeps its original name, lasso.cv, so other code that refers to
# it continues to work.
lasso.cv <- cv.glmnet(xm, y, alpha = 0, family = "gaussian")
ridge <- glmnet(xm, y, alpha = 0, lambda = lasso.cv$lambda.min)

# Fitted values on the training data and predictions on the validation data.
ridge.fit <- predict(ridge, xm)
xmv <- model.matrix(Sales ~ . + 0, data = dat.validation)
ridgep <- predict(ridge, xmv)

# Training MSE, then validation MSE, in original Sales units.
mean((ridge.fit - dat.train$Sales)^2)
mean((ridgep - dat.validation$Sales)^2)


# (g) The caret network was trained on the min-max scaled response, so its
# validation predictions are mapped back to original Sales units before
# computing an MSE against the unscaled validation response.
pov <- min(dat$Sales) + po * diff(range(dat$Sales))
mean((pov - dat.validation$Sales)^2)

# 2020-2021
# Question 3
# Wisconsin Breast Cancer Database
library(mlbench)
data(BreastCancer)
dim(BreastCancer)

# Set up the data: drop incomplete rows and the Id column.
dat <- na.omit(BreastCancer)
dat$Id <- NULL
n <- nrow(dat)

# Seed the RNG before drawing the split. The original sampled before any
# set.seed() call in this question, so the train/validation split depended
# on whatever code had run earlier and was not reproducible on its own.
set.seed(4061)
i.train <- sample(seq_len(n), 600, replace = FALSE)
dat.train <- dat[i.train, ]
dat.validation <- dat[-i.train, ]
# Random forest tuned by 10-fold cross-validation, then scored on the
# validation set.
set.seed(4061)
rf.Control <- trainControl(method = "cv", number = 10)
rf.out <- caret::train(
  Class ~ .,
  data = dat.train,
  method = "rf",
  trControl = rf.Control
)
rf.pred <- predict(rf.out, dat.validation)
rf.cm <- confusionMatrix(data = rf.pred, reference = dat.validation$Class)
# Linear-kernel SVM tuned by 10-fold cross-validation, then scored on the
# validation set.
set.seed(4061)
svm.Control <- trainControl(method = "cv", number = 10)
svm.out <- caret::train(
  Class ~ .,
  data = dat.train,
  method = "svmLinear",
  trControl = svm.Control
)
svm.pred <- predict(svm.out, dat.validation)
svm.cm <- confusionMatrix(data = svm.pred, reference = dat.validation$Class)
# Radial-kernel SVM tuned by 10-fold cross-validation, then scored on the
# validation set.
set.seed(4061)
svmR.Control <- trainControl(method = "cv", number = 10)
svmR.out <- caret::train(
  Class ~ .,
  data = dat.train,
  method = "svmRadial",
  trControl = svmR.Control
)
svmR.pred <- predict(svmR.out, dat.validation)
svmR.cm <- confusionMatrix(data = svmR.pred, reference = dat.validation$Class)
# Which model is deemed better? Put the overall validation statistics of the
# three classifiers side by side (columns: random forest, linear SVM,
# radial SVM).
round(
  cbind(rf.cm$overall, svm.cm$overall, svmR.cm$overall),
  3
)
# None, really, as the confidence intervals around the accuracies are
# comparable.

# Variable importance from the random forest.
varImp(rf.out)

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值