Table of Contents
数据预处理
library(ggplot2)
library(dplyr)
#导入包含数据的库
library("ISLR")
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
options (warn = -1)
#查看数据
data(Hitters)
#查看数据包含信息的具体含义,以便后续分析
?Hitters
fix(Hitters)
# 查看数据包含哪些内容
names(Hitters)
- 'AtBat'
- 'Hits'
- 'HmRun'
- 'Runs'
- 'RBI'
- 'Walks'
- 'Years'
- 'CAtBat'
- 'CHits'
- 'CHmRun'
- 'CRuns'
- 'CRBI'
- 'CWalks'
- 'League'
- 'Division'
- 'PutOuts'
- 'Assists'
- 'Errors'
- 'Salary'
- 'NewLeague'
#对数据进行预处理:删除存在空缺值的行
data<- na.omit(Hitters)
dim(data)
- 263
- 20
#绘制散点图
library(car)
scatterplotMatrix(data[c(12,3,4,5),c(1,2,3,4,5)],spread=FALSE,lty.smooth=2)
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-2DDzPp4v-1680423860901)(output_7_0.png)]
一元线性回归
#调整数据删除非数值数据(设置一个空值)
fix(data)
#对所有数值型属性计算pearson相关系数,搜索工资与那个属性的线性相关性很高
cor_pea <- cor(data[,complete.cases(t(data))],method='pearson')
# cor_pea[17,]
cor_pea
heatmap(cor_pea)
AtBat | Hits | HmRun | Runs | RBI | Walks | Years | CAtBat | CHits | CHmRun | CRuns | CRBI | CWalks | PutOuts | Assists | Errors | Salary | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
AtBat | 1.0000000 | 0.96396913 | 0.555102154 | 0.89982910 | 0.79601539 | 0.6244481 | 0.01272550 | 0.207166254 | 0.22534146 | 0.21242155 | 0.23727777 | 0.22139318 | 0.13292568 | 0.30960746 | 0.342117377 | 0.325576978 | 0.394770945 |
Hits | 0.9639691 | 1.00000000 | 0.530627358 | 0.91063014 | 0.78847819 | 0.5873105 | 0.01859809 | 0.206677608 | 0.23560577 | 0.18936425 | 0.23889610 | 0.21938423 | 0.12297073 | 0.29968754 | 0.303974950 | 0.279876183 | 0.438674738 |
HmRun | 0.5551022 | 0.53062736 | 1.000000000 | 0.63107588 | 0.84910743 | 0.4404537 | 0.11348842 | 0.217463613 | 0.21749569 | 0.49252584 | 0.25834685 | 0.34985838 | 0.22718318 | 0.25093150 | -0.161601753 | -0.009743082 | 0.343028078 |
Runs | 0.8998291 | 0.91063014 | 0.631075883 | 1.00000000 | 0.77869235 | 0.6970151 | -0.01197495 | 0.171810798 | 0.19132697 | 0.22970104 | 0.23783121 | 0.20233548 | 0.16370021 | 0.27115986 | 0.179257859 | 0.192608787 | 0.419858559 |
RBI | 0.7960154 | 0.78847819 | 0.849107434 | 0.77869235 | 1.00000000 | 0.5695048 | 0.12966795 | 0.278125914 | 0.29213714 | 0.44218969 | 0.30722616 | 0.38777657 | 0.23361884 | 0.31206456 | 0.062901737 | 0.150154692 | 0.449457088 |
Walks | 0.6244481 | 0.58731051 | 0.440453717 | 0.69701510 | 0.56950476 | 1.0000000 | 0.13479270 | 0.269449974 | 0.27079505 | 0.34958216 | 0.33297657 | 0.31269680 | 0.42913990 | 0.28085548 | 0.102522559 | 0.081937197 | 0.443867260 |
Years | 0.0127255 | 0.01859809 | 0.113488420 | -0.01197495 | 0.12966795 | 0.1347927 | 1.00000000 | 0.915680692 | 0.89784449 | 0.72237071 | 0.87664855 | 0.86380936 | 0.83752373 | -0.02001921 | -0.085117725 | -0.156511957 | 0.400656994 |
CAtBat | 0.2071663 | 0.20667761 | 0.217463613 | 0.17181080 | 0.27812591 | 0.2694500 | 0.91568069 | 1.000000000 | 0.99505681 | 0.80167609 | 0.98274694 | 0.95073014 | 0.90671165 | 0.05339251 | -0.007897271 | -0.070477521 | 0.526135310 |
CHits | 0.2253415 | 0.23560577 | 0.217495691 | 0.19132697 | 0.29213714 | 0.2707951 | 0.89784449 | 0.995056810 | 1.00000000 | 0.78665204 | 0.98454184 | 0.94679739 | 0.89071842 | 0.06734799 | -0.013144204 | -0.068035829 | 0.548909559 |
CHmRun | 0.2124215 | 0.18936425 | 0.492525845 | 0.22970104 | 0.44218969 | 0.3495822 | 0.72237071 | 0.801676089 | 0.78665204 | 1.00000000 | 0.82562483 | 0.92790264 | 0.81087827 | 0.09382223 | -0.188886464 | -0.165369407 | 0.524930560 |
CRuns | 0.2372778 | 0.23889610 | 0.258346846 | 0.23783121 | 0.30722616 | 0.3329766 | 0.87664855 | 0.982746941 | 0.98454184 | 0.82562483 | 1.00000000 | 0.94567701 | 0.92776846 | 0.05908718 | -0.038895093 | -0.094080542 | 0.562677711 |
CRBI | 0.2213932 | 0.21938423 | 0.349858379 | 0.20233548 | 0.38777657 | 0.3126968 | 0.86380936 | 0.950730141 | 0.94679739 | 0.92790264 | 0.94567701 | 1.00000000 | 0.88913701 | 0.09537515 | -0.096558877 | -0.115316131 | 0.566965686 |
CWalks | 0.1329257 | 0.12297073 | 0.227183183 | 0.16370021 | 0.23361884 | 0.4291399 | 0.83752373 | 0.906711655 | 0.89071842 | 0.81087827 | 0.92776846 | 0.88913701 | 1.00000000 | 0.05816016 | -0.066243445 | -0.129935875 | 0.489822036 |
PutOuts | 0.3096075 | 0.29968754 | 0.250931497 | 0.27115986 | 0.31206456 | 0.2808555 | -0.02001921 | 0.053392514 | 0.06734799 | 0.09382223 | 0.05908718 | 0.09537515 | 0.05816016 | 1.00000000 | -0.043390143 | 0.075305857 | 0.300480356 |
Assists | 0.3421174 | 0.30397495 | -0.161601753 | 0.17925786 | 0.06290174 | 0.1025226 | -0.08511772 | -0.007897271 | -0.01314420 | -0.18888646 | -0.03889509 | -0.09655888 | -0.06624345 | -0.04339014 | 1.000000000 | 0.703504693 | 0.025436136 |
Errors | 0.3255770 | 0.27987618 | -0.009743082 | 0.19260879 | 0.15015469 | 0.0819372 | -0.15651196 | -0.070477521 | -0.06803583 | -0.16536941 | -0.09408054 | -0.11531613 | -0.12993587 | 0.07530586 | 0.703504693 | 1.000000000 | -0.005400702 |
Salary | 0.3947709 | 0.43867474 | 0.343028078 | 0.41985856 | 0.44945709 | 0.4438673 | 0.40065699 | 0.526135310 | 0.54890956 | 0.52493056 | 0.56267771 | 0.56696569 | 0.48982204 | 0.30048036 | 0.025436136 | -0.005400702 | 1.000000000 |
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-1lLvlhNI-1680423860904)(output_10_1.png)]
通过计算pearson相关系数发现工资水平与其他指标的线性关系并不是很强。
一元线性回归(留出法)
#设置随机种子
set.seed(1)
#划分训练集与测试集,从392个中选196个出来,这可以当做训练集
train=sample(dim(data)[1],dim(data)[1]*0.6)
#经上述查询发现CAtBat与CHits的线性程度最高,所以对二者进行线性拟合,
fit.lm<-lm(CAtBat~CHits,data,subset=train)
fit.lm
# Residuals—残差统计量、intercept-表示截距、Estimate-包含由普通最小二乘法计算出来的估计回归系数、Std.error-估计的回归系数的标准误差、
# Multiple R-squared-拟合优度越大越好、F-statistic-判断方程的显著性检验
summary(fit.lm)
Call:
lm(formula = CAtBat ~ CHits, data = data, subset = train)
Coefficients:
(Intercept) CHits
101.297 3.555
Call:
lm(formula = CAtBat ~ CHits, data = data, subset = train)
Residuals:
Min 1Q Median 3Q Max
-800.47 -75.06 -8.98 58.63 509.19
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 101.29721 21.94608 4.616 8.16e-06 ***
CHits 3.55539 0.02713 131.051 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 176.3 on 155 degrees of freedom
Multiple R-squared: 0.9911, Adjusted R-squared: 0.991
F-statistic: 1.717e+04 on 1 and 155 DF, p-value: < 2.2e-16
options (warn = -1)
#绑定数据直接使用列名进行访问
attach(data)
#计算测试均方误差
mean((CAtBat-predict(fit.lm,data))[-train]^2)
# 重复运用验证集方法10次
err1=rep(0,10)
for ( i in 1 : 10 ) {
train2 <- sample ( dim(data)[1] , dim(data)[1]*0.6 )
lmfit2 <- lm ( CAtBat~CHits,data,subset=train2 )
pred2 <- predict ( lmfit2 , Auto [ - train2 , ] )
err1 [ i ] <- mean ( ( CAtBat [ - train2 ] - pred2 ) ^ 2 )
}
plot ( 1 : 10 , err1 , xlab = "" , type = "l" , main = "选取10个不同的训练集对应的测试误差" )
84470.5049254762
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-o9bUqzbi-1680423860905)(output_14_1.png)]
一元线性回归(交叉验证——留一法)
library(tidyverse)
#加载train函数的包
library(caret)
── [1mAttaching core tidyverse packages[22m ──────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mforcats [39m 1.0.0 [32m✔[39m [34mstringr [39m 1.5.0
[32m✔[39m [34mlubridate[39m 1.9.2 [32m✔[39m [34mtibble [39m 3.2.1
[32m✔[39m [34mpurrr [39m 1.0.1 [32m✔[39m [34mtidyr [39m 1.3.0
[32m✔[39m [34mreadr [39m 2.1.4
── [1mConflicts[22m ────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m masks [34mstats[39m::lag()
[31m✖[39m [34mcar[39m::[32mrecode()[39m masks [34mdplyr[39m::recode()
[31m✖[39m [34mpurrr[39m::[32msome()[39m masks [34mcar[39m::some()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors
Loading required package: lattice
Attaching package: 'caret'
The following object is masked from 'package:purrr':
lift
# data[,8:9]
#定义方法为留一法
train.control <- trainControl(method='LOOCV')
#训练模型,“lm”表示选用线性回归模型
model <- train(CAtBat~CHits,data,method='lm',trControl = train.control)
model$finalModel
par(mfrow=c(2,2))
plot(model$finalModel)
Call:
lm(formula = .outcome ~ ., data = dat)
Coefficients:
(Intercept) CHits
122.56 3.51
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-FAOGXJYT-1680423860906)(output_17_1.png)]
拟合优度R方很高,拟合效果较好。但对其进行线性回归的建设检验残差随机性较差。
一元线性回归(K折交叉验证)
#定义训练模型,设置随机种子,以k=10为例子
set.seed(123)
train.control <- trainControl(method ="cv",number=10)
#训练模型
model<- train(CAtBat~CHits,data,method="lm",trControl = train.control)
model$finalModel
par(mfrow=c(2,2))
plot(model$finalModel)
Call:
lm(formula = .outcome ~ ., data = dat)
Coefficients:
(Intercept) CHits
122.56 3.51
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-hZ6X0J6D-1680423860907)(output_20_1.png)]
10折交叉验证的方法得到的最终模型比留一法好一些(R方)
重复K折交叉验证
#定义训练模型,设置随机种子,以k=10为例子,重复10次
set.seed(123)
train.control <- trainControl(method ="repeatedcv",number=10,repeats=10)
#训练模型
model<- train(CAtBat~CHits,data,method="lm",trControl = train.control)
model$finalModel
par(mfrow=c(2,2))
plot(model$finalModel)
Call:
lm(formula = .outcome ~ ., data = dat)
Coefficients:
(Intercept) CHits
122.56 3.51
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-rsUP2NTY-1680423860907)(output_23_1.png)]
自助法
#定义训练模型,100次重采样
train.control <- trainControl(method ="boot",number=100)
#训练模型
model<- train(CAtBat~CHits,data,method="lm",trControl = train.control)
summary(model)
model$finalModel
par(mfrow=c(2,2))
plot(model$finalModel)
Call:
lm(formula = .outcome ~ ., data = dat)
Residuals:
Min 1Q Median 3Q Max
-1008.77 -98.49 -27.14 67.95 969.39
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 122.55929 21.02473 5.829 1.64e-08 ***
CHits 3.51015 0.02168 161.878 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 227.5 on 261 degrees of freedom
Multiple R-squared: 0.9901, Adjusted R-squared: 0.9901
F-statistic: 2.62e+04 on 1 and 261 DF, p-value: < 2.2e-16
Call:
lm(formula = .outcome ~ ., data = dat)
Coefficients:
(Intercept) CHits
122.56 3.51
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-r0lKGOam-1680423860908)(output_25_2.png)]
通过上述四种方法对模型进行评估,选择最优模型。
#回归诊断图的理解
#残差与拟合图,本质上残差服从正态分布与估计值无关的假设,与估计值无关,残差应该在y=0上下随机波动
#QQ图用来检测残差是否服从正态分布
#方差相同,红线应该是水平波动不可以存在上下波动
#检查是否存在特别极端的点cook的内部即可
#使用par()函数在同一窗口中创建多个图,mfrow: 决定了网格的行值和列
#绘制散点图
plot(data$CHits,data$CAtBat)
#添加拟合直线abline(a,b,h,v)a,b指定线的截距和斜率、h为水平线指定y、v为垂直线指定x
abline(model$finalModel)
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-rgqmwN8H-1680423860908)(output_28_0.png)]
# 预测
#构造要预测数据,newdata的类型必须是dataframe结构,
#而且必须是与原来的名称相同
newdata <- data.frame(CHits = seq(3000,3100, 1))
#predict.lm函数进行预测:predict(object,newdata,interval)object代表的是模型对象、interval代表的是置信区间的类型
#confidence是对均值做区间估计、prediction是对随机变量做区间预测
pred=predict(fit.lm, newdata, interval = "prediction")
pred
fit | lwr | upr | |
---|---|---|---|
1 | 10767.46 | 10395.64 | 11139.27 |
2 | 10771.01 | 10399.18 | 11142.85 |
3 | 10774.57 | 10402.72 | 11146.42 |
4 | 10778.12 | 10406.25 | 11150.00 |
5 | 10781.68 | 10409.79 | 11153.57 |
6 | 10785.23 | 10413.33 | 11157.14 |
7 | 10788.79 | 10416.86 | 11160.72 |
8 | 10792.35 | 10420.40 | 11164.29 |
9 | 10795.90 | 10423.94 | 11167.86 |
10 | 10799.46 | 10427.47 | 11171.44 |
11 | 10803.01 | 10431.01 | 11175.01 |
12 | 10806.57 | 10434.55 | 11178.59 |
13 | 10810.12 | 10438.09 | 11182.16 |
14 | 10813.68 | 10441.62 | 11185.73 |
15 | 10817.23 | 10445.16 | 11189.31 |
16 | 10820.79 | 10448.70 | 11192.88 |
17 | 10824.34 | 10452.23 | 11196.46 |
18 | 10827.90 | 10455.77 | 11200.03 |
19 | 10831.45 | 10459.31 | 11203.60 |
20 | 10835.01 | 10462.84 | 11207.18 |
21 | 10838.57 | 10466.38 | 11210.75 |
22 | 10842.12 | 10469.92 | 11214.33 |
23 | 10845.68 | 10473.45 | 11217.90 |
24 | 10849.23 | 10476.99 | 11221.47 |
25 | 10852.79 | 10480.53 | 11225.05 |
26 | 10856.34 | 10484.06 | 11228.62 |
27 | 10859.90 | 10487.60 | 11232.19 |
28 | 10863.45 | 10491.14 | 11235.77 |
29 | 10867.01 | 10494.68 | 11239.34 |
30 | 10870.56 | 10498.21 | 11242.92 |
⋮ | ⋮ | ⋮ | ⋮ |
72 | 11019.89 | 10646.75 | 11393.03 |
73 | 11023.45 | 10650.29 | 11396.60 |
74 | 11027.00 | 10653.83 | 11400.18 |
75 | 11030.56 | 10657.36 | 11403.75 |
76 | 11034.11 | 10660.90 | 11407.33 |
77 | 11037.67 | 10664.43 | 11410.90 |
78 | 11041.22 | 10667.97 | 11414.47 |
79 | 11044.78 | 10671.51 | 11418.05 |
80 | 11048.33 | 10675.04 | 11421.62 |
81 | 11051.89 | 10678.58 | 11425.20 |
82 | 11055.44 | 10682.12 | 11428.77 |
83 | 11059.00 | 10685.65 | 11432.35 |
84 | 11062.56 | 10689.19 | 11435.92 |
85 | 11066.11 | 10692.73 | 11439.49 |
86 | 11069.67 | 10696.26 | 11443.07 |
87 | 11073.22 | 10699.80 | 11446.64 |
88 | 11076.78 | 10703.34 | 11450.22 |
89 | 11080.33 | 10706.87 | 11453.79 |
90 | 11083.89 | 10710.41 | 11457.37 |
91 | 11087.44 | 10713.94 | 11460.94 |
92 | 11091.00 | 10717.48 | 11464.52 |
93 | 11094.55 | 10721.02 | 11468.09 |
94 | 11098.11 | 10724.55 | 11471.66 |
95 | 11101.66 | 10728.09 | 11475.24 |
96 | 11105.22 | 10731.63 | 11478.81 |
97 | 11108.78 | 10735.16 | 11482.39 |
98 | 11112.33 | 10738.70 | 11485.96 |
99 | 11115.89 | 10742.24 | 11489.54 |
100 | 11119.44 | 10745.77 | 11493.11 |
101 | 11123.00 | 10749.31 | 11496.68 |
回归诊断
library(car)
par(mfrow=c(2,2))
qqPlot(fit.lm,id.method='identify',simulate=TRUE,main="Q-Q plot")
residplot<-function(fit,nbreaks=10){
z<-rstudent(fit)
hist(z,breaks=nbreaks,freq=FALSE,
xlab="Studnetized Residual",
main="Distribution of Errors")
rug(jitter(z),col="brown")
curve(dnorm(x,mean=mean(z),sd=sd(z)),
add=TRUE,col="blue",lwd=2)
lines(density(z)$x,density(z)$y,
col="red",lwd=2,lty=2)
legend("topright",
legend=c("Normal Curve","Kernel Density Curve"),
lty=1:2,col=c("blue","red"),cex=0.7)}
residplot(fit.lm)
# Durbin-Watson检验的函数,可检测误差的序列相关性
durbinWatsonTest(fit.lm)
# 可通过成分残差图即偏残差图,判断因变量与自变量之间是否呈非线性关系,也可以看是否不同于已设定线性模型的系统偏差,图形可用car包中crPlots()函数绘制
crPlots(fit.lm)
# VIF(Variance Inflation Factor,方差膨胀因子)进行检测
# 一般原则下,(VIF)^1/2 >2表明存在多重共线性问题
# vif(fit.lm)#此处只是一元线性回归所以没法用
# sqrt(vif(fit.lm))>2
outlierTest(fit.lm)
-
-Wade Boggs
- 124 -Dave Parker
- 136
lag Autocorrelation D-W Statistic p-value
1 -0.05576414 2.100299 0.528
Alternative hypothesis: rho != 0
rstudent unadjusted p-value Bonferroni p
-Wade Boggs -4.888515 2.5292e-06 0.00039709
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-UTwwP8SS-1680423860909)(output_31_3.png)]
多项式回归
留出法
#设置随机种子
set.seed(1)
#划分训练集与测试集,从392个中选196个出来,这可以当做训练集
train=sample(dim(data)[1],dim(data)[1]*0.6)
#经上述查询发现CAtBat与CHits的线性程度最高,所以对二者进行线性拟合,
fit.lm<-lm(CAtBat~Years+I(Years^2),data,subset=train)
fit.lm
# Residuals—残差统计量、intercept-表示截距、Estimate-包含由普通最小二乘法计算出来的估计回归系数、Std.error-估计的回归系数的标准误差、
# Multiple R-squared-拟合优度越大越好、F-statistic-判断方程的显著性检验
summary(fit.lm)
Call:
lm(formula = CAtBat ~ Years + I(Years^2), data = data, subset = train)
Coefficients:
(Intercept) Years I(Years^2)
-174.98 350.94 2.27
Call:
lm(formula = CAtBat ~ Years + I(Years^2), data = data, subset = train)
Residuals:
Min 1Q Median 3Q Max
-2928.44 -393.39 41.81 561.16 2266.12
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -174.97 215.76 -0.811 0.419
Years 350.94 60.95 5.758 4.48e-08 ***
I(Years^2) 2.27 3.54 0.641 0.522
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 878.6 on 154 degrees of freedom
Multiple R-squared: 0.7792, Adjusted R-squared: 0.7763
F-statistic: 271.7 on 2 and 154 DF, p-value: < 2.2e-16
#绑定数据直接使用列名进行访问
attach(data)
#计算测试均方误差
mean((CAtBat-predict(fit.lm,data))[-train]^2)
# 重复运用验证集方法10次
err1=rep(0,10)
for ( i in 1 : 10 ) {
train2 <- sample ( dim(data)[1] , dim(data)[1]*0.6 )
lmfit2 <- lm ( CAtBat~CHits,data,subset=train2 )
pred2 <- predict ( lmfit2 , Auto [ - train2 , ] )
err1 [ i ] <- mean ( ( CAtBat [ - train2 ] - pred2 ) ^ 2 )
}
plot ( 1 : 10 , err1 , xlab = "" , type = "l" ,
main = "选取10个不同的训练集对应的测试误差" )
The following objects are masked from data (pos = 13):
Assists, AtBat, CAtBat, CHits, CHmRun, CRBI, CRuns, CWalks,
Division, Errors, Hits, HmRun, League, NewLeague, PutOuts, RBI,
Runs, Salary, Walks, Years
1034057.64239113
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-vkwPYkFh-1680423860910)(output_35_2.png)]
交叉检验——留一法
#对数据进行预处理按照年份求取均值,在进行拟合
df<-data.frame(data[,c(7,8)])
df2<-aggregate(df$CAtBat,by=list(type=df$Years),mean)
library(tidyverse)
df2=rename(df2,c('Years'=type,'CAtBat'=x))
#定义方法为留一法
train.control <- trainControl(method='LOOCV')
#训练模型,“lm”表示选用线性回归模型
model <- train(CAtBat~Years+I(Years^2)+I(Years^3),df2,method='lm',trControl = train.control)
model
model$finalModel
plot(df2$Years,df2$CAtBat)
lines(df2$Years,fitted(model$finalModel))
par(mfrow=c(2,2))
plot(model$finalModel)
Linear Regression
21 samples
1 predictor
No pre-processing
Resampling: Leave-One-Out Cross-Validation
Summary of sample sizes: 20, 20, 20, 20, 20, 20, ...
Resampling results:
RMSE Rsquared MAE
607.3668 0.9706062 354.2228
Tuning parameter 'intercept' was held constant at a value of TRUE
Call:
lm(formula = .outcome ~ ., data = dat)
Coefficients:
(Intercept) Years `I(Years^2)` `I(Years^3)`
-589.818 591.534 -29.969 1.262
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-twPjDGjQ-1680423860910)(output_38_2.png)]
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-MM409tU4-1680423860911)(output_38_3.png)]
尝试后,可能三次较好
多元线性回归:高阶的、交互项的
逐步回归
lm.fit=lm(Salary~.,data)
#逐步回归
lm.step1=step(lm.fit)
Start: AIC=3035.44
Salary ~ AtBat + Hits + HmRun + Runs + RBI + Walks + Years +
CAtBat + CHits + CHmRun + CRuns + CRBI + CWalks + League +
Division + PutOuts + Assists + Errors + NewLeague
Df Sum of Sq RSS AIC
- CHmRun 1 973 24187888 3033.4
- CHits 1 4570 24191486 3033.5
- NewLeague 1 10330 24197245 3033.6
- Years 1 10569 24197485 3033.6
- RBI 1 15126 24202042 3033.6
- HmRun 1 47407 24234323 3033.9
- Runs 1 58931 24245847 3034.1
- Errors 1 61915 24248830 3034.1
- League 1 62043 24248959 3034.1
- CRBI 1 133884 24320800 3034.9
- CAtBat 1 163910 24350826 3035.2
<none> 24186916 3035.4
- Assists 1 288343 24475259 3036.6
- CRuns 1 377536 24564451 3037.5
- CWalks 1 613995 24800910 3040.0
- Division 1 842840 25029755 3042.4
- AtBat 1 969060 25155975 3043.7
- Hits 1 970685 25157601 3043.8
- Walks 1 1146914 25333830 3045.6
- PutOuts 1 1294459 25481375 3047.1
Step: AIC=3033.45
Salary ~ AtBat + Hits + HmRun + Runs + RBI + Walks + Years +
CAtBat + CHits + CRuns + CRBI + CWalks + League + Division +
PutOuts + Assists + Errors + NewLeague
Df Sum of Sq RSS AIC
- Years 1 10311 24198200 3031.6
- NewLeague 1 10797 24198686 3031.6
- RBI 1 14186 24202074 3031.6
- CHits 1 15061 24202949 3031.6
- HmRun 1 52012 24239900 3032.0
- Runs 1 60645 24248533 3032.1
- Errors 1 63117 24251005 3032.1
- League 1 63247 24251136 3032.1
- CAtBat 1 178916 24366805 3033.4
<none> 24187888 3033.4
- Assists 1 293831 24481719 3034.6
- CRuns 1 619537 24807425 3038.1
- CWalks 1 651484 24839372 3038.4
- Division 1 843039 25030928 3040.4
- CRBI 1 865749 25053637 3040.7
- AtBat 1 968661 25156549 3041.7
- Hits 1 1006060 25193949 3042.1
- Walks 1 1158436 25346324 3043.7
- PutOuts 1 1299490 25487379 3045.2
Step: AIC=3031.56
Salary ~ AtBat + Hits + HmRun + Runs + RBI + Walks + CAtBat +
CHits + CRuns + CRBI + CWalks + League + Division + PutOuts +
Assists + Errors + NewLeague
Df Sum of Sq RSS AIC
- NewLeague 1 10341 24208540 3029.7
- RBI 1 15435 24213635 3029.7
- CHits 1 19987 24218187 3029.8
- HmRun 1 53660 24251860 3030.1
- Runs 1 59127 24257327 3030.2
- Errors 1 60105 24258304 3030.2
- League 1 65925 24264125 3030.3
<none> 24198200 3031.6
- CAtBat 1 275224 24473424 3032.5
- Assists 1 301166 24499366 3032.8
- CWalks 1 654794 24852994 3036.6
- CRuns 1 655377 24853577 3036.6
- Division 1 833693 25031893 3038.4
- CRBI 1 874410 25072610 3038.9
- AtBat 1 961565 25159764 3039.8
- Hits 1 996477 25194677 3040.1
- Walks 1 1155832 25354031 3041.8
- PutOuts 1 1312785 25510984 3043.4
Step: AIC=3029.67
Salary ~ AtBat + Hits + HmRun + Runs + RBI + Walks + CAtBat +
CHits + CRuns + CRBI + CWalks + League + Division + PutOuts +
Assists + Errors
Df Sum of Sq RSS AIC
- RBI 1 15256 24223796 3027.8
- CHits 1 17373 24225913 3027.9
- HmRun 1 54476 24263016 3028.3
- Errors 1 57130 24265671 3028.3
- Runs 1 58326 24266866 3028.3
- League 1 105730 24314271 3028.8
<none> 24208540 3029.7
- CAtBat 1 270031 24478572 3030.6
- Assists 1 303082 24511622 3030.9
- CWalks 1 654258 24862798 3034.7
- CRuns 1 670419 24878960 3034.8
- Division 1 830667 25039208 3036.5
- CRBI 1 882812 25091352 3037.1
- AtBat 1 992385 25200926 3038.2
- Hits 1 1013020 25221560 3038.4
- Walks 1 1148399 25356939 3039.8
- PutOuts 1 1313746 25522287 3041.5
Step: AIC=3027.84
Salary ~ AtBat + Hits + HmRun + Runs + Walks + CAtBat + CHits +
CRuns + CRBI + CWalks + League + Division + PutOuts + Assists +
Errors
Df Sum of Sq RSS AIC
- CHits 1 14968 24238763 3026.0
- HmRun 1 44993 24268789 3026.3
- Runs 1 50436 24274232 3026.4
- Errors 1 60377 24284173 3026.5
- League 1 106822 24330618 3027.0
<none> 24223796 3027.8
- CAtBat 1 261759 24485555 3028.7
- Assists 1 302461 24526257 3029.1
- CWalks 1 645358 24869154 3032.7
- CRuns 1 702809 24926605 3033.3
- Division 1 815566 25039362 3034.5
- CRBI 1 897559 25121355 3035.4
- Hits 1 1014099 25237895 3036.6
- AtBat 1 1035085 25258881 3036.8
- Walks 1 1137010 25360806 3037.9
- PutOuts 1 1318690 25542486 3039.7
Step: AIC=3026
Salary ~ AtBat + Hits + HmRun + Runs + Walks + CAtBat + CRuns +
CRBI + CWalks + League + Division + PutOuts + Assists + Errors
Df Sum of Sq RSS AIC
- HmRun 1 40626 24279390 3024.4
- Errors 1 54053 24292816 3024.6
- Runs 1 76176 24314940 3024.8
- League 1 113298 24352062 3025.2
<none> 24238763 3026.0
- Assists 1 290031 24528795 3027.1
- CAtBat 1 623235 24861999 3030.7
- Division 1 807193 25045957 3032.6
- CRBI 1 907050 25145813 3033.6
- CWalks 1 1021560 25260323 3034.8
- Walks 1 1241594 25480357 3037.1
- AtBat 1 1337687 25576451 3038.1
- PutOuts 1 1387677 25626441 3038.6
- CRuns 1 1395747 25634510 3038.7
- Hits 1 1592616 25831379 3040.7
Step: AIC=3024.44
Salary ~ AtBat + Hits + Runs + Walks + CAtBat + CRuns + CRBI +
CWalks + League + Division + PutOuts + Assists + Errors
Df Sum of Sq RSS AIC
- Errors 1 46032 24325422 3022.9
- Runs 1 46359 24325749 3022.9
- League 1 102449 24381839 3023.5
<none> 24279390 3024.4
- Assists 1 253322 24532712 3025.2
- CAtBat 1 662504 24941894 3029.5
- Division 1 801292 25080681 3030.9
- CWalks 1 992348 25271738 3032.9
- Walks 1 1201134 25480524 3035.1
- AtBat 1 1298852 25578242 3036.1
- CRuns 1 1356074 25635464 3036.7
- CRBI 1 1358716 25638106 3036.7
- PutOuts 1 1410607 25689996 3037.2
- Hits 1 1558262 25837652 3038.7
Step: AIC=3022.94
Salary ~ AtBat + Hits + Runs + Walks + CAtBat + CRuns + CRBI +
CWalks + League + Division + PutOuts + Assists
Df Sum of Sq RSS AIC
- Runs 1 51651 24377074 3021.5
- League 1 89806 24415228 3021.9
<none> 24325422 3022.9
- Assists 1 224409 24549831 3023.3
- CAtBat 1 658222 24983644 3027.9
- Division 1 804432 25129854 3029.5
- CWalks 1 978988 25304410 3031.3
- Walks 1 1235957 25561379 3033.9
- CRBI 1 1335850 25661272 3034.9
- CRuns 1 1362206 25687628 3035.2
- PutOuts 1 1372903 25698325 3035.3
- AtBat 1 1377934 25703357 3035.4
- Hits 1 1636145 25961567 3038.0
Step: AIC=3021.49
Salary ~ AtBat + Hits + Walks + CAtBat + CRuns + CRBI + CWalks +
League + Division + PutOuts + Assists
Df Sum of Sq RSS AIC
- League 1 110612 24487686 3020.7
<none> 24377074 3021.5
- Assists 1 285597 24662671 3022.5
- CAtBat 1 606762 24983836 3025.9
- Division 1 786536 25163610 3027.8
- CWalks 1 956574 25333648 3029.6
- Walks 1 1213748 25590821 3032.2
- CRuns 1 1334789 25711863 3033.5
- CRBI 1 1365809 25742883 3033.8
- PutOuts 1 1431602 25808675 3034.4
- AtBat 1 1519201 25896274 3035.3
- Hits 1 1712234 26089308 3037.3
Step: AIC=3020.68
Salary ~ AtBat + Hits + Walks + CAtBat + CRuns + CRBI + CWalks +
Division + PutOuts + Assists
Df Sum of Sq RSS AIC
<none> 24487686 3020.7
- Assists 1 319213 24806899 3022.1
- CAtBat 1 546871 25034557 3024.5
- Division 1 805422 25293108 3027.2
- CWalks 1 977622 25465308 3028.9
- CRuns 1 1270026 25757712 3031.9
- Walks 1 1290266 25777951 3032.1
- CRBI 1 1332094 25819779 3032.6
- PutOuts 1 1523160 26010846 3034.5
- AtBat 1 1584954 26072640 3035.1
- Hits 1 1708829 26196515 3036.3
#逐步回归
lm.step2=step(lm.fit)
#统计最好的模型
summary(lm.step2)
Start: AIC=3035.44
Salary ~ AtBat + Hits + HmRun + Runs + RBI + Walks + Years +
CAtBat + CHits + CHmRun + CRuns + CRBI + CWalks + League +
Division + PutOuts + Assists + Errors + NewLeague
Df Sum of Sq RSS AIC
- CHmRun 1 973 24187888 3033.4
- CHits 1 4570 24191486 3033.5
- NewLeague 1 10330 24197245 3033.6
- Years 1 10569 24197485 3033.6
- RBI 1 15126 24202042 3033.6
- HmRun 1 47407 24234323 3033.9
- Runs 1 58931 24245847 3034.1
- Errors 1 61915 24248830 3034.1
- League 1 62043 24248959 3034.1
- CRBI 1 133884 24320800 3034.9
- CAtBat 1 163910 24350826 3035.2
<none> 24186916 3035.4
- Assists 1 288343 24475259 3036.6
- CRuns 1 377536 24564451 3037.5
- CWalks 1 613995 24800910 3040.0
- Division 1 842840 25029755 3042.4
- AtBat 1 969060 25155975 3043.7
- Hits 1 970685 25157601 3043.8
- Walks 1 1146914 25333830 3045.6
- PutOuts 1 1294459 25481375 3047.1
Step: AIC=3033.45
Salary ~ AtBat + Hits + HmRun + Runs + RBI + Walks + Years +
CAtBat + CHits + CRuns + CRBI + CWalks + League + Division +
PutOuts + Assists + Errors + NewLeague
Df Sum of Sq RSS AIC
- Years 1 10311 24198200 3031.6
- NewLeague 1 10797 24198686 3031.6
- RBI 1 14186 24202074 3031.6
- CHits 1 15061 24202949 3031.6
- HmRun 1 52012 24239900 3032.0
- Runs 1 60645 24248533 3032.1
- Errors 1 63117 24251005 3032.1
- League 1 63247 24251136 3032.1
- CAtBat 1 178916 24366805 3033.4
<none> 24187888 3033.4
- Assists 1 293831 24481719 3034.6
- CRuns 1 619537 24807425 3038.1
- CWalks 1 651484 24839372 3038.4
- Division 1 843039 25030928 3040.4
- CRBI 1 865749 25053637 3040.7
- AtBat 1 968661 25156549 3041.7
- Hits 1 1006060 25193949 3042.1
- Walks 1 1158436 25346324 3043.7
- PutOuts 1 1299490 25487379 3045.2
Step: AIC=3031.56
Salary ~ AtBat + Hits + HmRun + Runs + RBI + Walks + CAtBat +
CHits + CRuns + CRBI + CWalks + League + Division + PutOuts +
Assists + Errors + NewLeague
Df Sum of Sq RSS AIC
- NewLeague 1 10341 24208540 3029.7
- RBI 1 15435 24213635 3029.7
- CHits 1 19987 24218187 3029.8
- HmRun 1 53660 24251860 3030.1
- Runs 1 59127 24257327 3030.2
- Errors 1 60105 24258304 3030.2
- League 1 65925 24264125 3030.3
<none> 24198200 3031.6
- CAtBat 1 275224 24473424 3032.5
- Assists 1 301166 24499366 3032.8
- CWalks 1 654794 24852994 3036.6
- CRuns 1 655377 24853577 3036.6
- Division 1 833693 25031893 3038.4
- CRBI 1 874410 25072610 3038.9
- AtBat 1 961565 25159764 3039.8
- Hits 1 996477 25194677 3040.1
- Walks 1 1155832 25354031 3041.8
- PutOuts 1 1312785 25510984 3043.4
Step: AIC=3029.67
Salary ~ AtBat + Hits + HmRun + Runs + RBI + Walks + CAtBat +
CHits + CRuns + CRBI + CWalks + League + Division + PutOuts +
Assists + Errors
Df Sum of Sq RSS AIC
- RBI 1 15256 24223796 3027.8
- CHits 1 17373 24225913 3027.9
- HmRun 1 54476 24263016 3028.3
- Errors 1 57130 24265671 3028.3
- Runs 1 58326 24266866 3028.3
- League 1 105730 24314271 3028.8
<none> 24208540 3029.7
- CAtBat 1 270031 24478572 3030.6
- Assists 1 303082 24511622 3030.9
- CWalks 1 654258 24862798 3034.7
- CRuns 1 670419 24878960 3034.8
- Division 1 830667 25039208 3036.5
- CRBI 1 882812 25091352 3037.1
- AtBat 1 992385 25200926 3038.2
- Hits 1 1013020 25221560 3038.4
- Walks 1 1148399 25356939 3039.8
- PutOuts 1 1313746 25522287 3041.5
Step: AIC=3027.84
Salary ~ AtBat + Hits + HmRun + Runs + Walks + CAtBat + CHits +
CRuns + CRBI + CWalks + League + Division + PutOuts + Assists +
Errors
Df Sum of Sq RSS AIC
- CHits 1 14968 24238763 3026.0
- HmRun 1 44993 24268789 3026.3
- Runs 1 50436 24274232 3026.4
- Errors 1 60377 24284173 3026.5
- League 1 106822 24330618 3027.0
<none> 24223796 3027.8
- CAtBat 1 261759 24485555 3028.7
- Assists 1 302461 24526257 3029.1
- CWalks 1 645358 24869154 3032.7
- CRuns 1 702809 24926605 3033.3
- Division 1 815566 25039362 3034.5
- CRBI 1 897559 25121355 3035.4
- Hits 1 1014099 25237895 3036.6
- AtBat 1 1035085 25258881 3036.8
- Walks 1 1137010 25360806 3037.9
- PutOuts 1 1318690 25542486 3039.7
Step: AIC=3026
Salary ~ AtBat + Hits + HmRun + Runs + Walks + CAtBat + CRuns +
CRBI + CWalks + League + Division + PutOuts + Assists + Errors
Df Sum of Sq RSS AIC
- HmRun 1 40626 24279390 3024.4
- Errors 1 54053 24292816 3024.6
- Runs 1 76176 24314940 3024.8
- League 1 113298 24352062 3025.2
<none> 24238763 3026.0
- Assists 1 290031 24528795 3027.1
- CAtBat 1 623235 24861999 3030.7
- Division 1 807193 25045957 3032.6
- CRBI 1 907050 25145813 3033.6
- CWalks 1 1021560 25260323 3034.8
- Walks 1 1241594 25480357 3037.1
- AtBat 1 1337687 25576451 3038.1
- PutOuts 1 1387677 25626441 3038.6
- CRuns 1 1395747 25634510 3038.7
- Hits 1 1592616 25831379 3040.7
Step: AIC=3024.44
Salary ~ AtBat + Hits + Runs + Walks + CAtBat + CRuns + CRBI +
CWalks + League + Division + PutOuts + Assists + Errors
Df Sum of Sq RSS AIC
- Errors 1 46032 24325422 3022.9
- Runs 1 46359 24325749 3022.9
- League 1 102449 24381839 3023.5
<none> 24279390 3024.4
- Assists 1 253322 24532712 3025.2
- CAtBat 1 662504 24941894 3029.5
- Division 1 801292 25080681 3030.9
- CWalks 1 992348 25271738 3032.9
- Walks 1 1201134 25480524 3035.1
- AtBat 1 1298852 25578242 3036.1
- CRuns 1 1356074 25635464 3036.7
- CRBI 1 1358716 25638106 3036.7
- PutOuts 1 1410607 25689996 3037.2
- Hits 1 1558262 25837652 3038.7
Step: AIC=3022.94
Salary ~ AtBat + Hits + Runs + Walks + CAtBat + CRuns + CRBI +
CWalks + League + Division + PutOuts + Assists
Df Sum of Sq RSS AIC
- Runs 1 51651 24377074 3021.5
- League 1 89806 24415228 3021.9
<none> 24325422 3022.9
- Assists 1 224409 24549831 3023.3
- CAtBat 1 658222 24983644 3027.9
- Division 1 804432 25129854 3029.5
- CWalks 1 978988 25304410 3031.3
- Walks 1 1235957 25561379 3033.9
- CRBI 1 1335850 25661272 3034.9
- CRuns 1 1362206 25687628 3035.2
- PutOuts 1 1372903 25698325 3035.3
- AtBat 1 1377934 25703357 3035.4
- Hits 1 1636145 25961567 3038.0
Step: AIC=3021.49
Salary ~ AtBat + Hits + Walks + CAtBat + CRuns + CRBI + CWalks +
League + Division + PutOuts + Assists
Df Sum of Sq RSS AIC
- League 1 110612 24487686 3020.7
<none> 24377074 3021.5
- Assists 1 285597 24662671 3022.5
- CAtBat 1 606762 24983836 3025.9
- Division 1 786536 25163610 3027.8
- CWalks 1 956574 25333648 3029.6
- Walks 1 1213748 25590821 3032.2
- CRuns 1 1334789 25711863 3033.5
- CRBI 1 1365809 25742883 3033.8
- PutOuts 1 1431602 25808675 3034.4
- AtBat 1 1519201 25896274 3035.3
- Hits 1 1712234 26089308 3037.3
Step: AIC=3020.68
Salary ~ AtBat + Hits + Walks + CAtBat + CRuns + CRBI + CWalks +
Division + PutOuts + Assists
Df Sum of Sq RSS AIC
<none> 24487686 3020.7
- Assists 1 319213 24806899 3022.1
- CAtBat 1 546871 25034557 3024.5
- Division 1 805422 25293108 3027.2
- CWalks 1 977622 25465308 3028.9
- CRuns 1 1270026 25757712 3031.9
- Walks 1 1290266 25777951 3032.1
- CRBI 1 1332094 25819779 3032.6
- PutOuts 1 1523160 26010846 3034.5
- AtBat 1 1584954 26072640 3035.1
- Hits 1 1708829 26196515 3036.3
Call:
lm(formula = Salary ~ AtBat + Hits + Walks + CAtBat + CRuns +
CRBI + CWalks + Division + PutOuts + Assists, data = data)
Residuals:
Min 1Q Median 3Q Max
-940.62 -177.71 -33.17 134.08 1909.31
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 163.24108 67.05209 2.435 0.015608 *
AtBat -2.16562 0.53729 -4.031 7.38e-05 ***
Hits 6.90505 1.64989 4.185 3.94e-05 ***
Walks 5.77344 1.58757 3.637 0.000335 ***
CAtBat -0.13315 0.05624 -2.368 0.018664 *
CRuns 1.43365 0.39735 3.608 0.000372 ***
CRBI 0.77607 0.21003 3.695 0.000270 ***
CWalks -0.83874 0.26496 -3.166 0.001739 **
DivisionW -112.96345 39.31545 -2.873 0.004410 **
PutOuts 0.29542 0.07477 3.951 0.000101 ***
Assists 0.28603 0.15813 1.809 0.071670 .
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 312.3 on 251 degrees of freedom
(1 observation deleted due to missingness)
Multiple R-squared: 0.5407, Adjusted R-squared: 0.5224
F-statistic: 29.55 on 10 and 251 DF, p-value: < 2.2e-16
留出法
#通过如下命令查看工资水平到底和那个有关系
summary(lm(Salary~.,data))
Call:
lm(formula = Salary ~ ., data = data)
Residuals:
Min 1Q Median 3Q Max
-908.37 -178.95 -31.67 141.08 1874.33
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 167.10276 91.57537 1.825 0.069271 .
AtBat -1.97769 0.63513 -3.114 0.002069 **
Hits 7.44005 2.38737 3.116 0.002052 **
HmRun 4.27968 6.21401 0.689 0.491662
Runs -2.29854 2.99337 -0.768 0.443310
RBI -1.01413 2.60683 -0.389 0.697598
Walks 6.20858 1.83278 3.388 0.000823 ***
Years -4.07600 12.53431 -0.325 0.745320
CAtBat -0.17368 0.13562 -1.281 0.201553
CHits 0.14463 0.67636 0.214 0.830857
CHmRun -0.15985 1.62049 -0.099 0.921502
CRuns 1.46166 0.75205 1.944 0.053109 .
CRBI 0.80318 0.69396 1.157 0.248251
CWalks -0.81493 0.32879 -2.479 0.013874 *
LeagueN 62.56007 79.40244 0.788 0.431534
DivisionW -117.56598 40.48478 -2.904 0.004025 **
PutOuts 0.27988 0.07777 3.599 0.000388 ***
Assists 0.37754 0.22228 1.699 0.090694 .
Errors -3.47048 4.40935 -0.787 0.432010
NewLeagueN -25.45031 79.16482 -0.321 0.748120
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 316.1 on 242 degrees of freedom
(1 observation deleted due to missingness)
Multiple R-squared: 0.5463, Adjusted R-squared: 0.5107
F-statistic: 15.34 on 19 and 242 DF, p-value: < 2.2e-16
#设置随机种子
set.seed(1)
#划分训练集与测试集,从392个中选196个出来,这可以当做训练集
train=sample(dim(data)[1],dim(data)[1]*0.6)
#经上述查询发现CAtBat与CHits的线性程度最高,所以对二者进行线性拟合,
fit.lm<-lm(Salary~AtBat+Hits+Walks+PutOuts,data,subset=train)
fit.lm
# Residuals—残差统计量、intercept-表示截距、Estimate-包含由普通最小二乘法计算出来的估计回归系数、Std.error-估计的回归系数的标准误差、
# Multiple R-squared-拟合优度越大越好、F-statistic-判断方程的显著性检验
summary(fit.lm)
Call:
lm(formula = Salary ~ AtBat + Hits + Walks + PutOuts, data = data,
subset = train)
Coefficients:
(Intercept) AtBat Hits Walks PutOuts
127.3390 -1.9234 7.1114 8.0472 0.2953
Call:
lm(formula = Salary ~ AtBat + Hits + Walks + PutOuts, data = data,
subset = train)
Residuals:
Min 1Q Median 3Q Max
-896.36 -217.42 -64.84 176.82 2008.31
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 127.3390 93.2714 1.365 0.17419
AtBat -1.9234 0.8224 -2.339 0.02065 *
Hits 7.1114 2.6684 2.665 0.00853 **
Walks 8.0472 1.9202 4.191 4.7e-05 ***
PutOuts 0.2953 0.1192 2.479 0.01429 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 381 on 152 degrees of freedom
Multiple R-squared: 0.3069, Adjusted R-squared: 0.2887
F-statistic: 16.83 on 4 and 152 DF, p-value: 1.935e-11
#绑定数据直接使用列名进行访问
attach(data)
#计算测试均方误差
mean((Salary-predict(fit.lm,data))[-train]^2)
# 重复运用验证集方法10次
err1=rep(0,10)
for ( i in 1 : 10 ) {
train2 <- sample ( dim(data)[1] , dim(data)[1]*0.6 )
lmfit2 <- lm (Salary~AtBat+Hits+Walks+PutOuts,data,subset=train2 )
pred2 <- predict ( lmfit2 , Auto [ - train2 , ] )
err1 [ i ] <- mean ( ( Salary [ - train2 ] - pred2 ) ^ 2 )
}
plot ( 1 : 10 , err1 , xlab = "" , type = "l" , main = "选取10个不同的训练集对应的测试误差" )
The following objects are masked from data (pos = 3):
Assists, AtBat, CAtBat, CHits, CHmRun, CRBI, CRuns, CWalks,
Division, Errors, Hits, HmRun, League, NewLeague, PutOuts, RBI,
Runs, Salary, Walks, Years
The following objects are masked from data (pos = 14):
Assists, AtBat, CAtBat, CHits, CHmRun, CRBI, CRuns, CWalks,
Division, Errors, Hits, HmRun, League, NewLeague, PutOuts, RBI,
Runs, Salary, Walks, Years
147498.365090483
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-GVEJSp5m-1680423860913)(output_47_2.png)]
留一法
#定义方法为留一法
train.control <- trainControl(method='LOOCV')
#训练模型,“lm”表示选用线性回归模型
model <- train(Salary~AtBat+Hits+Walks+PutOuts,data,method='lm',trControl = train.control)
model
model$finalModel
par(mfrow=c(2,2))
plot(model$finalModel)
Linear Regression
263 samples
4 predictor
No pre-processing
Resampling: Leave-One-Out Cross-Validation
Summary of sample sizes: 262, 262, 262, 262, 262, 262, ...
Resampling results:
RMSE Rsquared MAE
387.4613 0.2607978 278.2137
Tuning parameter 'intercept' was held constant at a value of TRUE
Call:
lm(formula = .outcome ~ ., data = dat)
Coefficients:
(Intercept) AtBat Hits Walks PutOuts
107.9885 -2.1739 8.8471 6.6693 0.2657
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-7nA34cEq-1680423860913)(output_49_2.png)]
留一法的拟合优度R方很小,比留出法还差
K折交叉检验
#定义训练模型,设置随机种子,以k=10为例子,重复10次
set.seed(123)
train.control <- trainControl(method ="cv",number=10)
#训练模型
model<- train(Salary~AtBat+Hits+Walks+PutOuts,data,method="lm",trControl = train.control)
summary(model$finalModel)
par(mfrow=c(2,2))
plot(model$finalModel)
Call:
lm(formula = .outcome ~ ., data = dat)
Residuals:
Min 1Q Median 3Q Max
-990.80 -228.07 -64.33 176.96 2033.25
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 107.9885 72.2842 1.494 0.136413
AtBat -2.1739 0.6236 -3.486 0.000576 ***
Hits 8.8471 1.9625 4.508 9.93e-06 ***
Walks 6.6693 1.3985 4.769 3.10e-06 ***
PutOuts 0.2657 0.0889 2.988 0.003074 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 380.3 on 258 degrees of freedom
Multiple R-squared: 0.3, Adjusted R-squared: 0.2892
F-statistic: 27.65 on 4 and 258 DF, p-value: < 2.2e-16
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-ciQSgOik-1680423860914)(output_52_1.png)]
重复K次交叉验证
#定义训练模型,设置随机种子,以k=10为例子,重复10次
set.seed(123)
# train.control <- trainControl(method ="repeatedcv",number=10,repeats=3)
train.control <- trainControl(method = "repeatedcv",
number = 10, repeats = 3)
#训练模型
model<- train(Salary~AtBat+Hits+Walks+PutOuts,data,method="lm",trControl = train.control)
summary(model$finalModel)
#提取系数的置信区间
confint(model$finalModel)
par(mfrow=c(2,2))
plot(model$finalModel)
Call:
lm(formula = .outcome ~ ., data = dat)
Residuals:
Min 1Q Median 3Q Max
-990.80 -228.07 -64.33 176.96 2033.25
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 107.9885 72.2842 1.494 0.136413
AtBat -2.1739 0.6236 -3.486 0.000576 ***
Hits 8.8471 1.9625 4.508 9.93e-06 ***
Walks 6.6693 1.3985 4.769 3.10e-06 ***
PutOuts 0.2657 0.0889 2.988 0.003074 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 380.3 on 258 degrees of freedom
Multiple R-squared: 0.3, Adjusted R-squared: 0.2892
F-statistic: 27.65 on 4 and 258 DF, p-value: < 2.2e-16
2.5 % | 97.5 % | |
---|---|---|
(Intercept) | -34.35369004 | 250.3306400 |
AtBat | -3.40188946 | -0.9459061 |
Hits | 4.98255201 | 12.7117007 |
Walks | 3.91540097 | 9.4232171 |
PutOuts | 0.09061403 | 0.4407509 |
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-GKFmdmpj-1680423860914)(output_54_2.png)]
K折交叉检验虽然对拟合优度有所提升但是,拟合效果依旧很差
#带交互项的回归
#注意:变量x和y在lm拟合中,x*y和x:y的意思是不一样的。
summary(lm(Salary~AtBat*PutOuts,data)) #单变量+交互
par(mfrow=c(2,2))
plot(lm(Salary~AtBat*PutOuts,data))
Call:
lm(formula = Salary ~ AtBat * PutOuts, data = data)
Residuals:
Min 1Q Median 3Q Max
-923.42 -248.32 -51.22 182.19 2002.85
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.070e+02 1.096e+02 0.976 0.33001
AtBat 8.524e-01 2.554e-01 3.337 0.00097 ***
PutOuts -7.263e-03 3.603e-01 -0.020 0.98393
AtBat:PutOuts 6.691e-04 7.158e-04 0.935 0.35077
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 407.4 on 259 degrees of freedom
Multiple R-squared: 0.1937, Adjusted R-squared: 0.1844
F-statistic: 20.74 on 3 and 259 DF, p-value: 4.476e-12
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-iWMoB9TA-1680423860915)(output_56_1.png)]
广义线性回归
对数回归
#对salary取对数进行回归
fit.lm=lm(log(Salary)~Walks,data)
summary(fit.lm)
Call:
lm(formula = log(Salary) ~ Walks, data = data)
Residuals:
Min 1Q Median 3Q Max
-1.8512 -0.6127 0.1142 0.6251 2.4633
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 5.199290 0.106206 48.955 < 2e-16 ***
Walks 0.017705 0.002285 7.748 2.07e-13 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.8033 on 261 degrees of freedom
Multiple R-squared: 0.187, Adjusted R-squared: 0.1839
F-statistic: 60.03 on 1 and 261 DF, p-value: 2.075e-13
Logistic回归
留出法
#将数据分成训练集和测试集:%>%将数据进行传递、
set.seed(123)
training.samples<- Hitters$NewLeague %>%
createDataPartition(p = 0.8, list = FALSE)
train.data <- Hitters[training.samples,]
test.data <- Hitters[-training.samples,]
glm.fits_1<-glm(NewLeague~.,train.data,family=binomial(link='logit'))
summary(glm.fits_1)
glm.probs=predict(glm.fits_1,test.data,type="response")
#查看到底哪个是0,logistic有一个默认的规则
contrasts(train.data$NewLeague)
#创建预测结果存储的地方,rep函数进行复制,
glm.pred=rep("A",dim(test.data)[1])
#全部的分类结果,设置了阈值为0.5
glm.pred[glm.probs>.5]='N'
#混淆矩阵
table(glm.pred,test.data$NewLeague)
#计算正确率
mean(glm.pred==test.data$NewLeague)
Call:
glm(formula = NewLeague ~ ., family = binomial(link = "logit"),
data = train.data)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.3871 -0.2839 -0.1428 0.3762 3.1521
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -4.1111835 1.3979353 -2.941 0.00327 **
AtBat 0.0139516 0.0099547 1.402 0.16106
Hits -0.0238494 0.0385859 -0.618 0.53652
HmRun -0.0230690 0.0950296 -0.243 0.80819
Runs -0.0422197 0.0462859 -0.912 0.36169
RBI 0.0088451 0.0384175 0.230 0.81791
Walks 0.0303770 0.0303220 1.002 0.31643
Years -0.2004386 0.1811425 -1.107 0.26850
CAtBat -0.0018551 0.0023725 -0.782 0.43427
CHits 0.0156182 0.0118849 1.314 0.18881
CHmRun 0.0062912 0.0257702 0.244 0.80713
CRuns -0.0118422 0.0101635 -1.165 0.24395
CRBI -0.0063914 0.0115327 -0.554 0.57945
CWalks 0.0030614 0.0056898 0.538 0.59055
LeagueN 5.7242274 0.7498249 7.634 2.27e-14 ***
DivisionW 0.0555476 0.6396310 0.087 0.93080
PutOuts -0.0004156 0.0011629 -0.357 0.72080
Assists -0.0033126 0.0036206 -0.915 0.36023
Errors -0.0074453 0.0649022 -0.115 0.90867
Salary -0.0007739 0.0009779 -0.791 0.42875
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 281.214 on 203 degrees of freedom
Residual deviance: 91.518 on 184 degrees of freedom
(54 observations deleted due to missingness)
AIC: 131.52
Number of Fisher Scoring iterations: 6
N | |
---|---|
A | 0 |
N | 1 |
glm.pred A N
A 34 3
N 1 26
0.9375
正确率达到0.9375,
glm.fits_1<-glm(NewLeague~.,train.data,family=binomial(link='probit'))
summary(glm.fits_1)
glm.probs=predict(glm.fits_1,test.data,type="response")
#查看到底哪个是0,logistic有一个默认的规则
contrasts(train.data$NewLeague)
#创建预测结果存储的地方,rep函数进行复制,
glm.pred=rep("A",dim(test.data)[1])
#全部的分类结果
glm.pred[glm.probs>.5]='N'
#混淆矩阵
table(glm.pred,test.data$NewLeague)
#计算正确率
mean(glm.pred==test.data$NewLeague)
Call:
glm(formula = NewLeague ~ ., family = binomial(link = "probit"),
data = train.data)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.3198 -0.2862 -0.1204 0.3753 2.9415
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.0228580 0.6800483 -2.975 0.00293 **
AtBat 0.0055448 0.0048246 1.149 0.25044
Hits -0.0083577 0.0188149 -0.444 0.65689
HmRun -0.0018123 0.0464151 -0.039 0.96885
Runs -0.0223190 0.0228722 -0.976 0.32916
RBI 0.0006059 0.0191925 0.032 0.97482
Walks 0.0186796 0.0147079 1.270 0.20407
Years -0.1128607 0.0927634 -1.217 0.22374
CAtBat -0.0006854 0.0011552 -0.593 0.55297
CHits 0.0071548 0.0059495 1.203 0.22913
CHmRun 0.0038043 0.0128131 0.297 0.76654
CRuns -0.0056003 0.0052602 -1.065 0.28703
CRBI -0.0028734 0.0058602 -0.490 0.62391
CWalks 0.0003207 0.0027509 0.117 0.90719
LeagueN 3.1376015 0.3388132 9.261 < 2e-16 ***
DivisionW -0.0295131 0.3149171 -0.094 0.92533
PutOuts -0.0001025 0.0005727 -0.179 0.85799
Assists -0.0013616 0.0017130 -0.795 0.42669
Errors -0.0053971 0.0317883 -0.170 0.86518
Salary -0.0004174 0.0004950 -0.843 0.39903
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 281.214 on 203 degrees of freedom
Residual deviance: 91.837 on 184 degrees of freedom
(54 observations deleted due to missingness)
AIC: 131.84
Number of Fisher Scoring iterations: 7
N | |
---|---|
A | 0 |
N | 1 |
glm.pred A N
A 34 3
N 1 26
0.9375
probit 与logistic的效果差不多
交叉验证法(待修改)
set.seed(123)
n=10 #模型阶数
cv.error.10=rep(0,n)
# for (i in 1:n){
# glm.fit=glm(mpg~poly(horsepower,i),data=Auto)
# cv.error.10[i]=cv.glm(Auto,glm.fit,K=10)$delta[1]
# }
for (i in 1:n){
glm.fit=glm(NewLeague~.,data,family=binomial(link='probit'))
cv.error.10[i]=cv.glm(data,glm.fit,K=10)$delta[1]
}
cv.error.10
Error in cv.glm(data, glm.fit, K = 10): could not find function "cv.glm"
Traceback:
library(nnet)
#建立模型
model<- nnet::multinom(NewLeague~AtBat+Hits+Walks+PutOuts,train.data)
summary(model)
# weights: 6 (5 variable)
initial value 178.831973
final value 176.202102
converged
Call:
nnet::multinom(formula = NewLeague ~ AtBat + Hits + Walks + PutOuts,
data = train.data)
Coefficients:
Values Std. Err.
(Intercept) 0.1680834713 0.3600629960
AtBat -0.0008943925 0.0033379907
Hits -0.0027225518 0.0105738112
Walks 0.0034700966 0.0080661133
PutOuts 0.0004106612 0.0004562542
Residual Deviance: 352.4042
AIC: 362.4042
predicted.classes<-model %>%
predict(test.data)
head(predicted.classes)
mean(predicted.classes == test.data$NewLeague)
- N
- A
- A
- A
- A
- N
- 'A'
- 'N'
0.53125
当减少变量时明显发现,准确率下降,猜测变量中有一个与因变量高度相关
线性判别分析
#将数据分成训练集和测试集:%>%将数据进行传递、
library(ipred)
set.seed(123)
data=na.omit(Hitters)
training.samples<- data$NewLeague %>%
createDataPartition(p = 0.8, list = FALSE)
train.data <- data[training.samples,]
test.data <- data[-training.samples,]
#导入包时底层逻辑出现问题
unloadNamespace('caret')
unloadNamespace('recipes')
unloadNamespace('ipred')
#使用lda
library(MASS)
Attaching package: 'MASS'
The following object is masked from 'package:dplyr':
select
(lda.fit=lda(NewLeague~.,train.data))
plot(lda.fit)
Call:
lda(NewLeague ~ ., data = train.data)
Prior probabilities of groups:
A N
0.535545 0.464455
Group means:
AtBat Hits HmRun Runs RBI Walks Years CAtBat
A 406.0885 109.1504 12.592920 57.00000 52.77876 41.07080 7.283186 2637.832
N 385.7041 102.1122 9.479592 49.59184 46.96939 40.52041 7.428571 2751.612
CHits CHmRun CRuns CRBI CWalks LeagueN DivisionW PutOuts
A 715.6018 72.73451 367.2212 332.5664 258.3186 0.07964602 0.4778761 256.9646
N 747.1224 61.76531 358.2959 330.0918 263.3673 0.94897959 0.5204082 303.2551
Assists Errors Salary
A 113.4425 8.362832 521.8844
N 123.2143 8.846939 537.6504
Coefficients of linear discriminants:
LD1
AtBat 0.0028002936
Hits -0.0031493663
HmRun 0.0104988252
Runs -0.0107130571
RBI -0.0024390763
Walks 0.0048200984
Years -0.0460721796
CAtBat -0.0000254391
CHits 0.0014060531
CHmRun 0.0001388015
CRuns -0.0015124610
CRBI -0.0006446335
CWalks 0.0002057631
LeagueN 4.0456150691
DivisionW 0.0351811062
PutOuts 0.0003566468
Assists 0.0002141323
Errors -0.0242090302
Salary -0.0001118049
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-IQTJfnTg-1680423860917)(output_76_1.png)]
lda.pred=predict(lda.fit,test.data)
paste('变量名称=',names(lda.pred))
lda.class=lda.pred$class
head(lda.class)
lda.posterior=lda.pred$posterior
head(lda.posterior)
- '变量名称= class'
- '变量名称= posterior'
- '变量名称= x'
- A
- N
- N
- A
- N
- A
- 'A'
- 'N'
A | N | |
---|---|---|
-Alvin Davis | 0.9979368675 | 0.0020631325 |
-Andre Dawson | 0.0007505105 | 0.9992494895 |
-Andres Galarraga | 0.0003391899 | 0.9996608101 |
-Alfredo Griffin | 0.9997526519 | 0.0002473481 |
-Andres Thomas | 0.0017857792 | 0.9982142208 |
-Andre Thornton | 0.9996359302 | 0.0003640698 |
#查看混淆矩阵
table(lda.class,test.data$NewLeague)
#计算预测的准确率
mean(lda.class==test.data$NewLeague)
#当后验概率使用50%的阈值时,重新预测,结果包含在lda.pred$class中。
sum(lda.pred$posterior[,1]>=0.5)
sum(lda.pred$posterior[,1]<0.5)
#注意到模型的后验概率对应着A的
lda.pred$posterior[1:20,1:2 ]
lda.class[1:20]
#,后验概率为90%
sum(lda.pred$posterior[,1]>=0.9)
lda.class A N
A 27 3
N 1 21
0.923076923076923
30
22
A | N | |
---|---|---|
-Alvin Davis | 0.9979368675 | 0.0020631325 |
-Andre Dawson | 0.0007505105 | 0.9992494895 |
-Andres Galarraga | 0.0003391899 | 0.9996608101 |
-Alfredo Griffin | 0.9997526519 | 0.0002473481 |
-Andres Thomas | 0.0017857792 | 0.9982142208 |
-Andre Thornton | 0.9996359302 | 0.0003640698 |
-Bob Brenly | 0.0001754981 | 0.9998245019 |
-Bill Doran | 0.0009425210 | 0.9990574790 |
-Brian Downing | 0.9996047108 | 0.0003952892 |
-Bill Madlock | 0.0016847021 | 0.9983152979 |
-Chris Brown | 0.0029810078 | 0.9970189922 |
-Carlton Fisk | 0.9995762139 | 0.0004237861 |
-Carney Lansford | 0.9989394170 | 0.0010605830 |
-Darrell Evans | 0.9987166364 | 0.0012833636 |
-Damaso Garcia | 0.9995550027 | 0.0004449973 |
-Don Mattingly | 0.9991239245 | 0.0008760755 |
-Eric Davis | 0.0023883074 | 0.9976116926 |
-Eddie Milner | 0.0011534457 | 0.9988465543 |
-George Bell | 0.9997279118 | 0.0002720882 |
-Greg Brock | 0.0002475808 | 0.9997524192 |
- A
- N
- N
- A
- N
- A
- N
- N
- A
- N
- N
- A
- A
- A
- A
- A
- N
- N
- A
- N
- 'A'
- 'N'
30
#阈值设为0.9时有30个是满足的
贝叶斯
# # Naive Bayes分类
library ( e1071 )
bayes.fit <- naiveBayes(NewLeague~.,train.data )
bayes.fit
bayes.pred <- predict ( bayes.fit , test.data)
table ( bayes.pred , test.data$NewLeague , dnn = c ( "Prediction" , "Actual" ) )
mean ( bayes.pred == test.data$NewLeague )
Naive Bayes Classifier for Discrete Predictors
Call:
naiveBayes.default(x = X, y = Y, laplace = laplace)
A-priori probabilities:
Y
A N
0.535545 0.464455
Conditional probabilities:
AtBat
Y [,1] [,2]
A 406.0885 150.5719
N 385.7041 148.2686
Hits
Y [,1] [,2]
A 109.1504 46.85849
N 102.1122 44.18538
HmRun
Y [,1] [,2]
A 12.592920 9.304029
N 9.479592 7.678098
Runs
Y [,1] [,2]
A 57.00000 26.96592
N 49.59184 23.34450
RBI
Y [,1] [,2]
A 52.77876 28.06553
N 46.96939 23.96581
Walks
Y [,1] [,2]
A 41.07080 23.26736
N 40.52041 19.11292
Years
Y [,1] [,2]
A 7.283186 4.676058
N 7.428571 5.058422
CAtBat
Y [,1] [,2]
A 2637.832 2198.567
N 2751.612 2542.893
CHits
Y [,1] [,2]
A 715.6018 619.5558
N 747.1224 726.1904
CHmRun
Y [,1] [,2]
A 72.73451 89.36422
N 61.76531 75.37614
CRuns
Y [,1] [,2]
A 367.2212 327.0564
N 358.2959 357.1627
CRBI
Y [,1] [,2]
A 332.5664 332.1341
N 330.0918 331.1957
CWalks
Y [,1] [,2]
A 258.3186 257.3786
N 263.3673 272.2331
League
Y A N
A 0.92035398 0.07964602
N 0.05102041 0.94897959
Division
Y E W
A 0.5221239 0.4778761
N 0.4795918 0.5204082
PutOuts
Y [,1] [,2]
A 256.9646 248.1815
N 303.2551 278.1822
Assists
Y [,1] [,2]
A 113.4425 139.3094
N 123.2143 146.8728
Errors
Y [,1] [,2]
A 8.362832 6.245087
N 8.846939 6.221599
Salary
Y [,1] [,2]
A 521.8844 464.9681
N 537.6504 448.5814
Actual
Prediction A N
A 27 4
N 1 20
0.903846153846154
library ( ROCR )
rocplot <- function ( pred , truth , ... ) {
predob <- prediction ( pred , truth )
perf <- performance ( predob , "tpr", "fpr" )
plot ( perf , ... )
auc <- performance ( predob , "auc" )
auc <- unlist ( slot ( auc , "y.values" ) )
auc <- round ( auc , 4 ) #保留4位小数
text ( x = 0.8 , y = 0.1 , labels = paste ( "AUC =" , auc ) )
}
# ROCR包用于ROC曲线。为了画出roc曲线,需要讲所有输出变成概率值,不用的概率值输出的方式不一样,代码如下
# ROC曲线的预测输出
bayes.pred2 <- predict ( bayes.fit , test.data , type = "raw") [ , 2] #bayes
par ( mfrow = c ( 2 , 2 ) )
y <- test.data$NewLeague
rocplot ( bayes.pred2 , y , main = "Bayes" )
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-9rfP1zgd-1680423860920)(output_82_0.png)]