统计预测与决策第二章代码重现

最新推荐文章于 2025-04-25 18:39:33 发布

Ba(OH)2

最新推荐文章于 2025-04-25 18:39:33 发布

阅读量178

点赞数 3

分类专栏：统计预测与决策　文章标签：r语言-4.2.1 笔记

本文链接：https://blog.csdn.net/m0_74802716/article/details/137033742

版权

统计预测与决策专栏收录该内容

2 篇文章

订阅专栏

## Statistical Prediction and Decision-Making: chapter 2 code
# NOTE: the workspace is intentionally not cleared here; a script should not
# wipe the caller's objects. Start a fresh R session for a clean environment.

# Exploratory data analysis ----
library(AppliedPredictiveModeling)
data(solubility)  # loads the solTrain*/solTest* objects used below
library(lattice)

# Scatter plots of the response against two individual predictors.
# lattice draws with grid graphics, so no base-graphics par(mar = ...) call is
# needed before these plots.
xyplot(solTrainY ~ solTrainX$MolWeight,
       type = c("p", "g"),  # points drawn over a reference grid
       ylab = "Solubility(log)",
       main = "(a)",
       xlab = "Molecular Weight")
xyplot(solTrainY ~ solTrainX$NumRotBonds,
       type = c("p", "g"),
       ylab = "Solubility(log)",
       xlab = "Number of Rotatable Bonds")

# Box plot ----
# Solubility grouped by column 100 of solTrainX: rows where that column equals
# 1 are labelled "structure present", all other rows "structure absent".
bwplot(
  solTrainY ~ ifelse(solTrainX[, 100] == 1,
                     "structure present",
                     "structure absent"),
  horizontal = FALSE,  # vertical boxes: groups on the x axis, response on y
  main = "(b)",
  ylab = "Solubility(log)"
)

# Feature selection and visualisation ----
library(caret)

# Positions of the columns whose names contain "FP"; these are excluded from
# the plots below via negative indexing.
Fingerprints <- grep("FP", names(solTrainXtrans))

# One scatter panel per remaining predictor against the response.
featurePlot(
  solTrainXtrans[, -Fingerprints],  # predictors without the "FP" columns
  solTrainY,                        # response
  labels = rep("", 2),              # blank x and y axis labels
  type = c("g", "p", "smooth"),     # grid, points, and a smoothed trend line
  between = list(x = 1, y = 1)      # spacing between adjacent panels
)

# Correlation matrix ----
library(corrplot)
# Pairwise correlations among the non-"FP" predictors. Computed on the training
# set (previously solTestXtrans) so that exploratory analysis does not look at
# held-out data and agrees with the training-set correlation filter used later.
# The corrplot:: prefix pins the call to the corrplot package's function.
corrplot::corrplot(cor(solTrainXtrans[, -Fingerprints]),
                   order = "hclust",  # order variables by hierarchical clustering
                   tl.cex = 0.8)      # text-label size

# 2.2 Linear regression ----
## Resampling scheme reused by every train() call that is given `ctrl`
set.seed(100)
# returnTrain = TRUE: each fold element lists the rows used for fitting
indx <- createFolds(solTrainY, returnTrain = TRUE)
# Cross-validation driven by the explicit fold indices above
ctrl <- trainControl(index = indx, method = "cv")

## Linear model on the full set of transformed training predictors
set.seed(100)
lmTune0 <- train(
  x = solTrainXtrans,
  y = solTrainY,
  trControl = ctrl,  # resampling settings
  method = "lm"      # ordinary linear regression
)
lmTune0  # print the fitted train object


## Linear regression after removing highly correlated predictors
# Column positions flagged by findCorrelation() at a cutoff of 0.9.
tooHigh <- findCorrelation(cor(solTrainXtrans), 0.9)
# Select the columns to keep instead of using -tooHigh: negative indexing with
# an empty vector would return zero columns rather than all of them.
# drop = FALSE keeps the result rectangular even if one column remains.
keepCols <- setdiff(seq_len(ncol(solTrainXtrans)), tooHigh)
trainXfiltered <- solTrainXtrans[, keepCols, drop = FALSE]
testXfiltered <- solTestXtrans[, keepCols, drop = FALSE]

set.seed(100)
lmTune <- train(x = trainXfiltered, y = solTrainY,
                method = "lm",
                trControl = ctrl)
lmTune

## Collect test-set results
# Observed test responses next to the filtered linear model's predictions;
# later sections add further columns to this data frame.
testResults <- data.frame(obs = solTestY,
                          Linear_Regression = predict(lmTune, testXfiltered))

## 2.3 Partial least squares (PLS) and principal component regression (PCR)
# PLS
set.seed(100)
plsTune <- train(x = solTrainXtrans, y = solTrainY,
                 method = "pls",
                 # candidate numbers of components: 1 to 20
                 tuneGrid = expand.grid(ncomp = 1:20),
                 trControl = ctrl)  # resampling settings
plsTune
# Test-set predictions from the tuned PLS model, stored alongside the other
# models' predictions (fixes the undefined `testResult` object).
testResults$PLS <- predict(plsTune, solTestXtrans)

## PCR
set.seed(100)
pcrTune <- train(x = solTrainXtrans, y = solTrainY,
                 method = "pcr",
                 # candidate numbers of components: 1 to 35
                 tuneGrid = expand.grid(ncomp = 1:35),
                 trControl = ctrl)
pcrTune

## Model comparison: resampled RMSE by number of components
library(lattice)
# Stack the tuning results of both models, tagging each row with its model.
plsResamples <- plsTune$results
plsResamples$Model <- "pls"
pcrResamples <- pcrTune$results
pcrResamples$Model <- "pcr"
plsPlotData <- rbind(plsResamples, pcrResamples)
xyplot(RMSE ~ ncomp,
       data = plsPlotData,
       xlab = "成分",
       ylab = "RMSE",
       auto.key = list(columns = 2),  # two-column legend (was misspelled `colums`)
       groups = Model,                # one line per model
       type = c("o", "g"))            # overplotted points and lines, plus grid
library(randomForest)
# Variable importance for the PLS model, left unscaled; plot the top 25.
plsImp <- varImp(plsTune, scale = FALSE)
plot(plsImp, top = 25, scales = list(y = list(cex = 0.95)))

## Ridge regression tuning
# 15 evenly spaced penalty values from 0 to 0.1.
ridgeGrid <- expand.grid(lambda = seq(0, 0.1, length.out = 15))
set.seed(100)
ridgeTune <- train(x = solTrainXtrans, y = solTrainY,
                   method = "ridge",
                   tuneGrid = ridgeGrid,  # grid of lambda values
                   trControl = ctrl,
                   preProcess = c("center", "scale"))  # centre and scale predictors
ridgeTune
# Tuning profile with a relabelled x axis. update() modifies the plot object
# returned by plot(); the previous `updata` is not a defined function.
print(update(plot(ridgeTune), xlab = "Penalty"))


## Elastic net
## Tuning grid: 3 lambda values crossed with 20 fraction values in [0.05, 1]
enetGrid <- expand.grid(lambda = c(0, 0.01, 0.1),
                        fraction = seq(0.05, 1, length.out = 20))
set.seed(100)
enetTune <- train(x = solTrainXtrans, y = solTrainY,
                  method = "enet",
                  tuneGrid = enetGrid,
                  trControl = ctrl,
                  preProcess = c("center", "scale"))  # centre and scale predictors
enetTune
plot(enetTune)  # tuning profile over the grid
# Test-set predictions from the tuned elastic net model.
testResults$Enet <- predict(enetTune, solTestXtrans)