# TEST 2020-21
# Question 1
# Question 2
# Libraries used throughout this script:
library(nnet)    # single-hidden-layer neural networks
library(gbm)     # gradient boosting machines
library(glmnet)  # ridge / lasso regression
library(caret)   # train(), trainControl(), confusionMatrix(), varImp()
library(mlbench) # BreastCancer data (Question 3)
dat = read.csv(file = 'dodgysales.csv', stringsAsFactors = TRUE)
n = nrow(dat)
set.seed(6041)
i.train = sample(1:n, floor(.7*n))
dat.train = dat[i.train,]
dat.validation = dat[-i.train,]
#(a)
# Is this a regression or a classification problem?
# regression
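# (Quick check: the response Sales, as used throughout below, is numeric
# rather than a factor, which confirms a regression problem.)
is.numeric(dat$Sales)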
#(b)
# Quote the number P of predictors present in this dataset.
ncol(dat)-1
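# (For reference, the predictors are all columns other than the response:)
names(dat)[names(dat) != "Sales"]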
# (c)
# Create a scaled copy dat.s of dataset dat, using min-max normalisation
# (apply this scaling to the response variable also).
# (i)
# Quote the 5-number summaries of dat.s$Sales, dat.s$BudgOp and
# provide the frequency distribution table for dat.s$Training
minmax.scale <- function(x){
  # scale numeric columns to [0,1]; leave factor columns unchanged
  if(!is.factor(x)){
    xs = (x - min(x)) / (max(x) - min(x))
  } else {
    xs = x
  }
  return(xs)
}
dat.s=as.data.frame(lapply(dat,minmax.scale))
summary(dat.s$Sales)
summary(dat.s$BudgOp)
summary(dat.s$Training)
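# summary() on a factor already returns counts; table() states the frequency
# distribution of dat.s$Training more explicitly:
table(dat.s$Training)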
# (ii)
# Split the scaled data dat.s into training and validation subsets.
dat.s.train = dat.s[i.train,]
dat.s.validation = dat.s[-i.train,]
# Fit two single-layer feed-forward neural networks, using respectively 3
# and 8 neurons in the hidden layer. Use the nnet library to do this and
# set the random seed to 6041 (set.seed(6041)) before performing any
# model fit. Quote the corresponding training Mean Squared Errors (MSEs).
set.seed(6041)
nn3=nnet(Sales~.,data = dat.s.train,size=3)
set.seed(6041)
nn8=nnet(Sales~.,data = dat.s.train,size=8)
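# (NB: nnet uses a logistic output unit by default, which is acceptable here
# only because the response was min-max scaled into [0,1]; for an unscaled
# numeric response one would pass linout=TRUE.)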
mean(nn3$residuals^2)
mean(nn8$residuals^2)
#(iii)
# Generate predictions for the validation set dat.s.validation from
# each of the neural networks trained in (ii). Quote the corresponding
# validation MSEs.
# predict for validation data
y.test=dat.s.validation$Sales
p3= predict(nn3,dat.s.validation)
p8= predict(nn8,dat.s.validation)
mean((p3-y.test)^2)
mean((p8-y.test)^2)
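# A compact side-by-side view of training vs validation MSEs (a convenience
# sketch using the quantities computed above):
rbind(training   = c(nn3 = mean(nn3$residuals^2), nn8 = mean(nn8$residuals^2)),
      validation = c(nn3 = mean((p3 - y.test)^2), nn8 = mean((p8 - y.test)^2)))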
# (iv)
# Suggest an explanation for the difference between the training and
# validation errors for each of these neural networks.
# The training MSEs are computed on the same data used to fit the models, so
# they underestimate the generalisation error; the validation MSEs are larger,
# as expected. The gap tends to be wider for the 8-neuron network, which has
# more weights and hence more capacity to overfit the training data. Note also
# that this is a single random split, so the comparison is subject to sampling
# variability and remains limited.
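# One way to see the difference in complexity is to count the fitted weights
# of each network (nnet stores them in $wts):
length(nn3$wts)
length(nn8$wts)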
# (d)
# Set random seed to 6041 (set.seed(6041)) and fit a gradient boosting model
# to the training data dat.train, using package gbm, and using 100 weak
# learners for this ensemble. Quote the corresponding
# training and validation MSEs.
set.seed(6041)
gbmo = gbm(Sales~.,data = dat.train,distribution = 'gaussian',
n.trees = 100)
gbmp = predict(gbmo, dat.validation, n.trees = 100)
mean((gbmo$fit-dat.train$Sales)^2)
mean((gbmp-dat.validation$Sales)^2)
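# As a by-product, gbm can report the relative influence of each predictor
# (plotit=FALSE just returns the table):
summary(gbmo, plotit = FALSE)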
# (e)
# Set random seed to 6041 (set.seed(6041)) and
#fit a generalized linear regression model to the training data dat.train.
# Quote the corresponding training and validation MSEs.
set.seed(6041)
glmo = glm(Sales~.,data = dat.train,family = 'gaussian')
glmp= predict(glmo,dat.validation)
mean((fitted(glmo)-dat.train$Sales)^2)
mean((glmp-dat.validation$Sales)^2)
# (f)
# Set random seed to 6041 (set.seed(6041)) and
# fit a ridge regression model to the training data dat.train.
# Quote the corresponding training and validation MSEs.
set.seed(6041)
xm= model.matrix(Sales~.+0,data = dat.train)
y = dat.train$Sales
# cross-validate the ridge penalty (note alpha=0 here too, so that lambda is
# tuned for ridge rather than for the default lasso)
ridge.cv = cv.glmnet(xm, y, family="gaussian", alpha=0)
ridge = glmnet(xm, y, alpha=0, lambda=ridge.cv$lambda.min)
ridge.fit= predict(ridge,xm)
xmv= model.matrix(Sales~.+0,data = dat.validation)
ridgep= predict(ridge,xmv)
mean((ridge.fit-dat.train$Sales)^2)
mean((ridgep-dat.validation$Sales)^2)
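# The coefficients at the CV-selected lambda can be inspected to see the
# shrinkage (ridge shrinks coefficients towards zero without zeroing them):
coef(ridge)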
# (g) NB: the networks were fit on the scaled data, so their predictions need
# to be rescaled before comparing errors on the original Sales scale.
# Compare and comment on the validation errors obtained from the
# neural networks and from ridge regression.
set.seed(6041)
nnC = trainControl(method="cv", number=10)
nno = train(Sales~.,data = dat.s.train, method="nnet",
trControl=nnC)
names(nno)
nno$bestTune
nno.fit = predict(nno,dat.s.train)
nno.res = nno.fit-dat.s.train$Sales
mean(nno.res^2)
po=predict(nno,dat.s.validation)
mean((po-y.test)^2)
# Back-transform the scaled predictions to the original Sales scale so that
# the validation MSE is comparable with models fitted on the unscaled data:
pov=po*(max(dat$Sales)-min(dat$Sales))+min(dat$Sales)
mean((pov-dat.validation$Sales)^2)
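# Collecting the validation MSEs on the original Sales scale for a direct
# comparison (a convenience sketch using objects computed above):
c(nnet.caret = mean((pov - dat.validation$Sales)^2),
  ridge      = mean((ridgep - dat.validation$Sales)^2),
  gbm        = mean((gbmp - dat.validation$Sales)^2),
  glm        = mean((glmp - dat.validation$Sales)^2))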
# Question 3
# Wisconsin Breast Cancer Database
data(BreastCancer)
dim(BreastCancer)
# set up the data
dat=na.omit(BreastCancer)
dat$Id=NULL
n=nrow(dat)
set.seed(4061) # seed the split too, so it is reproducible
i.train=sample(1:n,600,replace=FALSE)
dat.train=dat[i.train,]
dat.validation=dat[-i.train,]
# (a)
# Set random seed to 4061 (set.seed(4061)) and fit a random forest model to
# the training set, performing a simple 10-fold cross-validation for training.
# Obtain predictions from this model for the validation set dat.validation.
# (i) Quote the number of variables used at each split.
# (ii) Provide the test set prediction accuracy achieved with this model.
# random forest
set.seed(4061)
rf.Control=trainControl(method='cv',number=10)
rf.out=caret::train(Class~.,trControl=rf.Control,data=dat.train,method='rf')
rf.pred=predict(rf.out,dat.validation)
rf.cm=confusionMatrix(reference=dat.validation$Class,data=rf.pred)
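# (i) the number of variables used at each split is the mtry value retained
# by caret:
rf.out$bestTune
# (ii) validation ("test set") prediction accuracy:
rf.cm$overall["Accuracy"]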
# SVM(linear)
set.seed(4061)
svm.Control=trainControl(method='cv',number=10)
svm.out=caret::train(Class~.,trControl=svm.Control,data=dat.train,method='svmLinear')
svm.pred=predict(svm.out,dat.validation)
svm.cm=confusionMatrix(reference=dat.validation$Class,data=svm.pred)
# SVM(radial)
set.seed(4061)
svmR.Control=trainControl(method='cv',number=10)
svmR.out=caret::train(Class~.,trControl=svmR.Control,data=dat.train,method='svmRadial')
svmR.pred=predict(svmR.out,dat.validation)
svmR.cm=confusionMatrix(reference=dat.validation$Class,data=svmR.pred)
# which model is deemed better?
round(cbind(rf.cm$overall,svm.cm$overall,svmR.cm$overall),3)
# None, really: the accuracies are comparable and their CIs overlap.
# variable importance
varImp(rf.out)
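# A dotplot of the same importance scores is often easier to read:
plot(varImp(rf.out))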