# 应用回归及分类——基于R (Applied Regression and Classification with R)
# 第二章 经典线性回归——交叉验证比较模型 (Chapter 2: Classical linear regression — comparing models via cross-validation)
########第二章########
### 2.4.1 Linear models ###
# NOTE(review): hard-coded absolute Windows path — confirm data location before running.
Puromycin <- read.csv("F:/R/应用回归分析/数据和代码/PureData/Puromycin.csv")

# Simple linear model: y = b0 + b1*x + e
a1 <- lm(rate ~ conc, Puromycin)
summary(a1)
# Shapiro-Wilk normality test on the residuals (H0: residuals are normal).
# residuals() replaces a1$res, which relied on fragile partial name matching.
shapiro.test(residuals(a1))

# Add the qualitative variable state: y = b0 + b1*x + a_i + e,
# where a_i is the effect of the i-th level of state.
a2 <- lm(rate ~ ., Puromycin)
summary(a2)
shapiro.test(residuals(a2))
anova(a2)  # qualitative terms are tested with ANOVA

# Add the interaction: y = b0 + (b1 + r_i)*x + a_i + e,
# where r_i (i = 1, 2) is the slope increment for each level of state.
a3 <- lm(rate ~ conc * state, Puromycin)
summary(a3)
shapiro.test(residuals(a3))
anova(a3)

# Log-transform conc: y = b0 + b1*ln(x) + e
a4 <- lm(rate ~ log(conc), Puromycin)
summary(a4)
shapiro.test(residuals(a4))

# a4 plus the qualitative variable state: y = b0 + b1*ln(x) + a_i + e
a5 <- lm(rate ~ log(conc) + state, Puromycin)
summary(a5)
shapiro.test(residuals(a5))
anova(a5)

# Add the interaction as well: y = b0 + (b1 + r_i)*ln(x) + a_i + e
a6 <- lm(rate ~ log(conc) * state, Puromycin)
summary(a6)
shapiro.test(residuals(a6))
anova(a6)
#######2.4.4交叉验证######
###划分数据集###
# Split the rows of `w` into Z folds, stratified by the grouping column `D`,
# so that every fold contains roughly the same proportion of each class.
#
# Z    : number of folds (default 5)
# w    : data frame to split
# D    : index of the stratification column
# seed : RNG seed for reproducibility
# Returns a list of Z integer vectors of row indices (one per fold).
Fold <- function(Z = 5, w, D, seed = 7777) {
  n <- nrow(w)
  idx <- seq_len(n)
  # factor() makes this work for character columns too: since R 4.0,
  # read.csv() no longer converts strings to factors, so levels() on a
  # character column would return NULL and the loop below would never run.
  lev <- levels(factor(w[[D]]))
  set.seed(seed)
  per_class <- vector("list", length(lev))
  for (i in seq_along(lev)) {
    d0 <- idx[w[[D]] == lev[i]]  # rows belonging to class i
    j <- length(d0)
    # Balanced fold labels 1..Z recycled over the j class members.
    ZT <- rep(seq_len(Z), ceiling(j / Z))[seq_len(j)]
    # Shuffle the fold labels and pair each with its row index.
    per_class[[i]] <- cbind(sample(ZT, length(ZT)), d0)
  }
  # Collect, for each fold, the row indices drawn from every class.
  mm <- vector("list", Z)
  for (i in seq_len(Z)) {
    u <- NULL
    for (j in seq_along(lev)) {
      u <- c(u, per_class[[j]][per_class[[j]][, 1] == i, 2])
    }
    mm[[i]] <- u
  }
  mm
}
### Build the regression formulas for the six candidate models ###
w <- Puromycin
D <- 2  # column index of the response (rate)
yv <- names(w)[D]  # response name
x1 <- names(w)[1]  # quantitative predictor (conc)
x3 <- names(w)[3]  # qualitative predictor (state)
# Right-hand sides, in the order the models are compared:
# linear, +state, *state, log, log+state, log*state.
rhs <- c(
  x1,
  ".",
  paste(x1, "*", x3),
  paste0("log(", x1, ")"),
  paste0("log(", x1, ")+", x3),
  paste0("log(", x1, ")*", x3)
)
FML <- lapply(paste(yv, "~", rhs), as.formula)
J <- length(FML)  # J == 6, same final value as the original running counter
### 1000 repetitions of 5-fold cross-validation; each repetition uses its own
### pre-drawn random seed so the 1000 partitions actually differ.
JJ <- 6    # number of candidate models
D <- 2     # response column (rate)
Z <- 5     # folds
N <- 1000  # repetitions
set.seed(1010)
Seed <- sample(1:100000, N)
# Preallocate instead of growing WW with rbind() on every iteration.
WW <- matrix(0, N * Z, JJ)
for (k in 1:N) {
  # BUG FIX: the original called Fold(..., Seed[N]) — the LAST seed on every
  # repetition — so all 1000 "random" partitions were identical. Use Seed[k].
  mm <- Fold(Z = 5, w, 3, Seed[k])  # stratify folds by column 3 (state)
  E <- matrix(-99, Z, JJ)
  for (J in 1:JJ) {
    for (i in 1:Z) {
      m <- mm[[i]]
      # Normalizing constant: MSE of predicting the test-fold mean.
      M <- mean((w[m, D] - mean(w[m, D]))^2)
      a0 <- lm(FML[[J]], data = w[-m, ])
      pa <- predict(a0, w[m, ])
      E[i, J] <- mean((w[m, D] - pa)^2) / M  # normalized MSE for fold i
    }
  }
  WW[((k - 1) * Z + 1):(k * Z), ] <- E
}
(ZZ <- apply(WW, 2, mean))  # average NMSE of the six models
###### 2.5 Concrete compressive strength ######
# NOTE(review): hard-coded absolute Windows path — confirm data location before running.
w <- read.csv("F:/R/应用回归分析/数据和代码/PureData/Concrete.csv")
plot(w)  # scatterplot matrix of all variables

# Linear regression with all main effects, no interaction terms.
a <- lm(Compressive.strength ~ ., w)
# residuals()/fitted() replace a$res/a$fit, which relied on fragile
# partial name matching.
shapiro.test(residuals(a))

# Normal Q-Q plot of the residuals and a residuals-vs-fitted plot.
op <- par(mfrow = c(1, 2))
qqnorm(residuals(a)); qqline(residuals(a))
plot(residuals(a) ~ fitted(a)); abline(h = 0)
par(op)  # restore the layout so later figures are not drawn in half panels
summary(a)
#################和其他方法的交叉验证比较####################
#### CV(): randomly split the indices 1..n into Z folds, for use in the
#### cross-validation below. Returns a list of Z integer index vectors.
CV <- function(n, Z = 10, seed = 888) {
  # Balanced fold labels 1, 2, ..., Z recycled over the n observations.
  labels <- rep(seq_len(Z), length.out = n)
  set.seed(seed)
  shuffled <- sample(labels, n)
  # Group the row indices by their shuffled fold label; fixing the factor
  # levels guarantees exactly Z groups even when some label is absent.
  unname(split(seq_len(n), factor(shuffled, levels = seq_len(Z))))
}
##################################################
#install.packages("rpart.plot")
#install.packages("ipred")
#install.packages("mboost")
#install.packages("randomForest")
#install.packages("kernlab")
#install.packages("e1071")
#install.packages("nnet")
#install.packages("neuralnet")
library(rpart.plot)
library(ipred)
library(mboost)
library(randomForest)
library(kernlab)
library(e1071)
library(nnet)
library(neuralnet)
D <- 9   # response column: Compressive.strength
Z <- 10  # 10-fold cross-validation
mm <- CV(nrow(w), Z)

# Plain "response ~ ." formula used by most of the learners below.
gg <- as.formula(paste0(names(w)[D], "~."))
print(gg)

# mboost needs every predictor wrapped in a btree() base learner:
# "response ~ btree(x1) + btree(x2) + ...".
zy <- (1:ncol(w))[-D]  # predictor column indices
btree_terms <- paste0("btree(", names(w)[zy], ")")
gg1 <- as.formula(paste0(names(w)[D], "~", paste(btree_terms, collapse = "+")))
# 10-fold CV comparison of six learners. Each entry of MSE is the normalized
# MSE: test MSE divided by the MSE of predicting the test-fold mean.
MSE <- matrix(0, Z, 6)
J <- 1

# 1) Boosting with tree base learners (mboost).
set.seed(1010)
for (i in 1:Z) {
  m <- mm[[i]]
  M <- mean((w[m, D] - mean(w[m, D]))^2)  # benchmark: predict the fold mean
  a <- mboost(gg1, data = w[-m, ])
  MSE[i, J] <- mean((w[m, D] - predict(a, w[m, ]))^2) / M
}

# 2) Bagging (ipred).
J <- J + 1
set.seed(1010)
for (i in 1:Z) {
  m <- mm[[i]]
  M <- mean((w[m, D] - mean(w[m, D]))^2)
  a <- bagging(gg, data = w[-m, ])
  MSE[i, J] <- mean((w[m, D] - predict(a, w[m, ]))^2) / M
}

# 3) Random forest.
J <- J + 1
set.seed(1010)
for (i in 1:Z) {
  m <- mm[[i]]
  M <- mean((w[m, D] - mean(w[m, D]))^2)
  a <- randomForest(gg, data = w[-m, ])
  MSE[i, J] <- mean((w[m, D] - predict(a, w[m, ]))^2) / M
}

# 4) Support vector machine (kernlab).
# FIX: the original passed model = "svm", which is not a ksvm() argument and
# was silently swallowed by "...". Also seed the RNG: ksvm's automatic sigma
# estimation (sigest) is stochastic, so this loop was not reproducible.
J <- J + 1
set.seed(1010)
for (i in 1:Z) {
  m <- mm[[i]]
  M <- mean((w[m, D] - mean(w[m, D]))^2)
  a <- ksvm(gg, w[-m, ])
  MSE[i, J] <- mean((w[m, D] - predict(a, w[m, ]))^2) / M
}

# 5) Single-hidden-layer neural network (nnet). The response is scaled to
# [0, 1] because nnet's default output unit is logistic.
J <- J + 1
set.seed(1010)
for (i in 1:Z) {
  m <- mm[[i]]
  M <- mean((w[m, D] - mean(w[m, D]))^2)
  # BUG FIX: with the response supplied as an external vector, "~ ." expands
  # to every column of `data` — including the (unscaled) response itself,
  # leaking the target into the predictors. Drop column D from the data.
  a <- nnet(w[-m, D] / max(w[, D]) ~ ., data = w[-m, -D], size = 6, decay = 0.1)
  MSE[i, J] <- mean((w[m, D] - predict(a, w[m, ]) * max(w[, D]))^2) / M
}

# 6) Ordinary linear regression as the baseline.
J <- J + 1
for (i in 1:Z) {
  m <- mm[[i]]
  M <- mean((w[m, D] - mean(w[m, D]))^2)
  a <- lm(gg, w[-m, ])
  MSE[i, J] <- mean((w[m, D] - predict(a, w[m, ]))^2) / M
}

MSE <- data.frame(MSE)
names(MSE) <- c("adaboost", "bagging", "RF", "SVM", "NNET", "lm")
(NMSE <- apply(MSE, 2, mean))
MSE
#### Plots of the cross-validation results ####
# Line plot: one per-fold NMSE trace per learner.
# Columns are referenced explicitly instead of via attach()/detach(), which
# pollutes the search path and is error-prone; base legend() is used, so the
# original library(Hmisc) call was unnecessary and has been dropped.
plot(MSE$adaboost, type = "b", main = "10-fold cross validation",
     xlab = "Fold", ylab = "NMSE", ylim = c(0, 0.5))
lines(MSE$bagging, type = "o", col = "blue")
lines(MSE$RF, type = "l", col = "green")
lines(MSE$SVM, type = "b", pch = 2, col = "red")
lines(MSE$NNET, type = "b", pch = 3, col = "yellow")
lines(MSE$lm, type = "b", pch = 4, col = "black")
legend("topleft", c("adaboost", "bagging", "RF", "SVM", "NNET", "lm"),
       pch = c(1, 1, 1, 2, 3, 4), lty = rep(1, 6),
       col = c("black", "blue", "green", "red", "yellow", "black"))

# Bar chart of the average NMSE per learner.
barplot(NMSE)
# (CSDN editor placeholder "在这里插入代码片" removed — not code)