机器学习————线性回归篇

最新推荐文章于 2024-10-08 21:16:33 发布

sdz6966

最新推荐文章于 2024-10-08 21:16:33 发布

阅读量105

点赞数

分类专栏：机器学习文章标签：回归机器学习逻辑回归

本文链接：https://blog.csdn.net/misslink/article/details/129912700

版权

机器学习专栏收录该内容

1 篇文章 0 订阅

订阅专栏

数据预处理

library(ggplot2)
library(dplyr)
#导入包含数据的库
library("ISLR")

Attaching package: 'dplyr'


The following objects are masked from 'package:stats':

    filter, lag


The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

# Silence all warnings for the rest of the session.
options(warn = -1)

# Load the Hitters data set into the workspace.
data("Hitters")
# Open its help page to see what each variable means before analysing it.
help("Hitters")

# Open the data set in the interactive data editor.
fix(Hitters)
# List the variables the data set contains.
names(Hitters)

'AtBat'
'Hits'
'HmRun'
'Runs'
'RBI'
'Walks'
'Years'
'CAtBat'
'CHits'
'CHmRun'
'CRuns'
'CRBI'
'CWalks'
'League'
'Division'
'PutOuts'
'Assists'
'Errors'
'Salary'
'NewLeague'

# Preprocessing: drop every row that contains a missing value.
data <- na.omit(Hitters)
dim(data)

# Scatterplot matrix of the first five variables.
# The row index is left empty so that every observation is plotted; indexing
# rows with c(12, 3, 4, 5) would restrict the plot to four observations.
library(car)
scatterplotMatrix(data[, 1:5], spread = FALSE, lty.smooth = 2)

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-2DDzPp4v-1680423860901)(output_7_0.png)]

一元线性回归

# Keep only the numeric variables. They are selected programmatically so the
# script does not depend on hand-editing the data in the fix() editor.
numeric_cols <- vapply(data, is.numeric, logical(1))

# Pearson correlation between every pair of numeric variables, used to see
# which variable is most linearly related to Salary.
cor_pea <- cor(data[, numeric_cols], method = "pearson")
# cor_pea[17, ]
cor_pea
heatmap(cor_pea)

A matrix: 17 × 17 of type dbl
	AtBat	Hits	HmRun	Runs	RBI	Walks	Years	CAtBat	CHits	CHmRun	CRuns	CRBI	CWalks	PutOuts	Assists	Errors	Salary
AtBat	1.0000000	0.96396913	0.555102154	0.89982910	0.79601539	0.6244481	0.01272550	0.207166254	0.22534146	0.21242155	0.23727777	0.22139318	0.13292568	0.30960746	0.342117377	0.325576978	0.394770945
Hits	0.9639691	1.00000000	0.530627358	0.91063014	0.78847819	0.5873105	0.01859809	0.206677608	0.23560577	0.18936425	0.23889610	0.21938423	0.12297073	0.29968754	0.303974950	0.279876183	0.438674738
HmRun	0.5551022	0.53062736	1.000000000	0.63107588	0.84910743	0.4404537	0.11348842	0.217463613	0.21749569	0.49252584	0.25834685	0.34985838	0.22718318	0.25093150	-0.161601753	-0.009743082	0.343028078
Runs	0.8998291	0.91063014	0.631075883	1.00000000	0.77869235	0.6970151	-0.01197495	0.171810798	0.19132697	0.22970104	0.23783121	0.20233548	0.16370021	0.27115986	0.179257859	0.192608787	0.419858559
RBI	0.7960154	0.78847819	0.849107434	0.77869235	1.00000000	0.5695048	0.12966795	0.278125914	0.29213714	0.44218969	0.30722616	0.38777657	0.23361884	0.31206456	0.062901737	0.150154692	0.449457088
Walks	0.6244481	0.58731051	0.440453717	0.69701510	0.56950476	1.0000000	0.13479270	0.269449974	0.27079505	0.34958216	0.33297657	0.31269680	0.42913990	0.28085548	0.102522559	0.081937197	0.443867260
Years	0.0127255	0.01859809	0.113488420	-0.01197495	0.12966795	0.1347927	1.00000000	0.915680692	0.89784449	0.72237071	0.87664855	0.86380936	0.83752373	-0.02001921	-0.085117725	-0.156511957	0.400656994
CAtBat	0.2071663	0.20667761	0.217463613	0.17181080	0.27812591	0.2694500	0.91568069	1.000000000	0.99505681	0.80167609	0.98274694	0.95073014	0.90671165	0.05339251	-0.007897271	-0.070477521	0.526135310
CHits	0.2253415	0.23560577	0.217495691	0.19132697	0.29213714	0.2707951	0.89784449	0.995056810	1.00000000	0.78665204	0.98454184	0.94679739	0.89071842	0.06734799	-0.013144204	-0.068035829	0.548909559
CHmRun	0.2124215	0.18936425	0.492525845	0.22970104	0.44218969	0.3495822	0.72237071	0.801676089	0.78665204	1.00000000	0.82562483	0.92790264	0.81087827	0.09382223	-0.188886464	-0.165369407	0.524930560
CRuns	0.2372778	0.23889610	0.258346846	0.23783121	0.30722616	0.3329766	0.87664855	0.982746941	0.98454184	0.82562483	1.00000000	0.94567701	0.92776846	0.05908718	-0.038895093	-0.094080542	0.562677711
CRBI	0.2213932	0.21938423	0.349858379	0.20233548	0.38777657	0.3126968	0.86380936	0.950730141	0.94679739	0.92790264	0.94567701	1.00000000	0.88913701	0.09537515	-0.096558877	-0.115316131	0.566965686
CWalks	0.1329257	0.12297073	0.227183183	0.16370021	0.23361884	0.4291399	0.83752373	0.906711655	0.89071842	0.81087827	0.92776846	0.88913701	1.00000000	0.05816016	-0.066243445	-0.129935875	0.489822036
PutOuts	0.3096075	0.29968754	0.250931497	0.27115986	0.31206456	0.2808555	-0.02001921	0.053392514	0.06734799	0.09382223	0.05908718	0.09537515	0.05816016	1.00000000	-0.043390143	0.075305857	0.300480356
Assists	0.3421174	0.30397495	-0.161601753	0.17925786	0.06290174	0.1025226	-0.08511772	-0.007897271	-0.01314420	-0.18888646	-0.03889509	-0.09655888	-0.06624345	-0.04339014	1.000000000	0.703504693	0.025436136
Errors	0.3255770	0.27987618	-0.009743082	0.19260879	0.15015469	0.0819372	-0.15651196	-0.070477521	-0.06803583	-0.16536941	-0.09408054	-0.11531613	-0.12993587	0.07530586	0.703504693	1.000000000	-0.005400702
Salary	0.3947709	0.43867474	0.343028078	0.41985856	0.44945709	0.4438673	0.40065699	0.526135310	0.54890956	0.52493056	0.56267771	0.56696569	0.48982204	0.30048036	0.025436136	-0.005400702	1.000000000

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-1lLvlhNI-1680423860904)(output_10_1.png)]

通过计算pearson相关系数发现工资水平与其他指标的线性关系并不是很强。

一元线性回归（留出法）

# Fix the random seed so the split is reproducible.
set.seed(1)
# Hold-out split: draw 60% of the row indices of `data` as the training set.
n_obs <- nrow(data)
train <- sample(n_obs, n_obs * 0.6)
# CAtBat and CHits are the most strongly correlated pair in the correlation
# matrix, so fit a simple linear regression of CAtBat on CHits.
fit.lm <- lm(CAtBat ~ CHits, data, subset = train)
fit.lm
# Reading the summary: Residuals = summary statistics of the residuals;
# (Intercept) = the intercept; Estimate = ordinary-least-squares coefficient
# estimates; Std. Error = standard errors of those estimates;
# Multiple R-squared = goodness of fit (larger is better);
# F-statistic = test of the overall significance of the regression.
summary(fit.lm)

Call:
lm(formula = CAtBat ~ CHits, data = data, subset = train)

Coefficients:
(Intercept)        CHits  
    101.297        3.555  





Call:
lm(formula = CAtBat ~ CHits, data = data, subset = train)

Residuals:
    Min      1Q  Median      3Q     Max 
-800.47  -75.06   -8.98   58.63  509.19 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 101.29721   21.94608   4.616 8.16e-06 ***
CHits         3.55539    0.02713 131.051  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 176.3 on 155 degrees of freedom
Multiple R-squared:  0.9911,	Adjusted R-squared:  0.991 
F-statistic: 1.717e+04 on 1 and 155 DF,  p-value: < 2.2e-16

options(warn = -1)
# Test-set mean squared error of the hold-out fit. Columns are referenced
# through `data$` rather than attach(), so no variable can be silently picked
# up from the search path.
mean((data$CAtBat - predict(fit.lm, data))[-train]^2)
# Repeat the validation-set approach on 10 different random splits.
n_obs <- nrow(data)
err1 <- rep(0, 10)
for (i in seq_len(10)) {
  train2 <- sample(n_obs, n_obs * 0.6)
  lmfit2 <- lm(CAtBat ~ CHits, data, subset = train2)
  # Predict on the held-out rows of `data` (not `Auto`, which is not the data
  # the model was fitted on), so predictions line up with the observed values.
  pred2 <- predict(lmfit2, data[-train2, ])
  err1[i] <- mean((data$CAtBat[-train2] - pred2)^2)
}
plot(1:10, err1, xlab = "", type = "l", main = "选取10个不同的训练集对应的测试误差")

84470.5049254762

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-o9bUqzbi-1680423860905)(output_14_1.png)]

一元线性回归（交叉验证——留一法）

library(tidyverse)
#加载train函数的包
library(caret)

── [1mAttaching core tidyverse packages[22m ──────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.0
[32m✔[39m [34mlubridate[39m 1.9.2     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mpurrr    [39m 1.0.1     [32m✔[39m [34mtidyr    [39m 1.3.0
[32m✔[39m [34mreadr    [39m 2.1.4     
── [1mConflicts[22m ────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[31m✖[39m [34mcar[39m::[32mrecode()[39m   masks [34mdplyr[39m::recode()
[31m✖[39m [34mpurrr[39m::[32msome()[39m   masks [34mcar[39m::some()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors
Loading required package: lattice


Attaching package: 'caret'


The following object is masked from 'package:purrr':

    lift

# data[, 8:9]
# Resampling scheme: leave-one-out cross-validation.
train.control <- trainControl(method = "LOOCV")
# Fit the model; method = "lm" selects linear regression.
model <- train(CAtBat ~ CHits, data, method = "lm", trControl = train.control)
model$finalModel
# Draw the diagnostic plots of the final model on a 2 x 2 grid.
par(mfrow = c(2, 2))
plot(model$finalModel)

Call:
lm(formula = .outcome ~ ., data = dat)

Coefficients:
(Intercept)        CHits  
     122.56         3.51

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-FAOGXJYT-1680423860906)(output_17_1.png)]

拟合优度R方很高，拟合效果较好。但对其进行线性回归的假设检验时，残差的随机性较差。

一元线性回归（K折交叉验证）

# Fix the random seed, then set up k-fold cross-validation with k = 10.
set.seed(123)
train.control <- trainControl(method = "cv", number = 10)
# Fit the model.
model <- train(CAtBat ~ CHits, data, method = "lm", trControl = train.control)
model$finalModel

# Draw the diagnostic plots of the final model on a 2 x 2 grid.
par(mfrow = c(2, 2))
plot(model$finalModel)

Call:
lm(formula = .outcome ~ ., data = dat)

Coefficients:
(Intercept)        CHits  
     122.56         3.51

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-hZ6X0J6D-1680423860907)(output_20_1.png)]

10折交叉验证得到的评估结果（R方）比留一法好一些；两种方法输出的最终模型系数相同（截距122.56，斜率3.51），差别只体现在重采样的评估指标上。

重复K折交叉验证

# Fix the random seed, then set up 10-fold cross-validation repeated 10 times.
set.seed(123)
train.control <- trainControl(method = "repeatedcv", number = 10, repeats = 10)
# Fit the model.
model <- train(CAtBat ~ CHits, data, method = "lm", trControl = train.control)
model$finalModel

# Draw the diagnostic plots of the final model on a 2 x 2 grid.
par(mfrow = c(2, 2))
plot(model$finalModel)

Call:
lm(formula = .outcome ~ ., data = dat)

Coefficients:
(Intercept)        CHits  
     122.56         3.51

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-rsUP2NTY-1680423860907)(output_23_1.png)]

自助法

# Resampling scheme: bootstrap with 100 resamples.
train.control <- trainControl(method = "boot", number = 100)
# Fit the model.
model <- train(CAtBat ~ CHits, data, method = "lm", trControl = train.control)
summary(model)
model$finalModel

# Draw the diagnostic plots of the final model on a 2 x 2 grid.
par(mfrow = c(2, 2))
plot(model$finalModel)

Call:
lm(formula = .outcome ~ ., data = dat)

Residuals:
     Min       1Q   Median       3Q      Max 
-1008.77   -98.49   -27.14    67.95   969.39 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 122.55929   21.02473   5.829 1.64e-08 ***
CHits         3.51015    0.02168 161.878  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 227.5 on 261 degrees of freedom
Multiple R-squared:  0.9901,	Adjusted R-squared:  0.9901 
F-statistic: 2.62e+04 on 1 and 261 DF,  p-value: < 2.2e-16





Call:
lm(formula = .outcome ~ ., data = dat)

Coefficients:
(Intercept)        CHits  
     122.56         3.51

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-r0lKGOam-1680423860908)(output_25_2.png)]

通过上述四种方法对模型进行评估，选择最优模型。

# How to read the regression diagnostic plots:
# - Residuals vs fitted: the residuals are assumed to be normally distributed
#   and unrelated to the fitted values, so they should scatter randomly
#   around y = 0.
# - Q-Q plot: checks whether the residuals follow a normal distribution.
# - Equal variance: the red line should stay roughly horizontal, without an
#   upward or downward trend.
# - Extreme points: points should lie inside the Cook's distance contours.
# - par() puts several plots in one window; mfrow sets the number of rows and
#   columns of the grid.

# Scatter plot of the two variables.
plot(x = data$CHits, y = data$CAtBat)
# Add the fitted line. abline(a, b, h, v): a and b give the intercept and
# slope, h gives the y value of a horizontal line, v the x value of a
# vertical line.
abline(model$finalModel)

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-rgqmwN8H-1680423860908)(output_28_0.png)]

# Prediction
# Build the data to predict on: `newdata` must be a data frame whose column
# names match the predictor names used in the fit.
newdata <- data.frame(CHits = seq(from = 3000, to = 3100, by = 1))
# predict(object, newdata, interval): `object` is the fitted model and
# `interval` is the type of interval. "confidence" gives an interval estimate
# for the mean response; "prediction" gives an interval for a new observation.
pred <- predict(fit.lm, newdata, interval = "prediction")
pred

A matrix: 101 × 3 of type dbl
	fit	lwr	upr
1	10767.46	10395.64	11139.27
2	10771.01	10399.18	11142.85
3	10774.57	10402.72	11146.42
4	10778.12	10406.25	11150.00
5	10781.68	10409.79	11153.57
6	10785.23	10413.33	11157.14
7	10788.79	10416.86	11160.72
8	10792.35	10420.40	11164.29
9	10795.90	10423.94	11167.86
10	10799.46	10427.47	11171.44
11	10803.01	10431.01	11175.01
12	10806.57	10434.55	11178.59
13	10810.12	10438.09	11182.16
14	10813.68	10441.62	11185.73
15	10817.23	10445.16	11189.31
16	10820.79	10448.70	11192.88
17	10824.34	10452.23	11196.46
18	10827.90	10455.77	11200.03
19	10831.45	10459.31	11203.60
20	10835.01	10462.84	11207.18
21	10838.57	10466.38	11210.75
22	10842.12	10469.92	11214.33
23	10845.68	10473.45	11217.90
24	10849.23	10476.99	11221.47
25	10852.79	10480.53	11225.05
26	10856.34	10484.06	11228.62
27	10859.90	10487.60	11232.19
28	10863.45	10491.14	11235.77
29	10867.01	10494.68	11239.34
30	10870.56	10498.21	11242.92
⋮	⋮	⋮	⋮
72	11019.89	10646.75	11393.03
73	11023.45	10650.29	11396.60
74	11027.00	10653.83	11400.18
75	11030.56	10657.36	11403.75
76	11034.11	10660.90	11407.33
77	11037.67	10664.43	11410.90
78	11041.22	10667.97	11414.47
79	11044.78	10671.51	11418.05
80	11048.33	10675.04	11421.62
81	11051.89	10678.58	11425.20
82	11055.44	10682.12	11428.77
83	11059.00	10685.65	11432.35
84	11062.56	10689.19	11435.92
85	11066.11	10692.73	11439.49
86	11069.67	10696.26	11443.07
87	11073.22	10699.80	11446.64
88	11076.78	10703.34	11450.22
89	11080.33	10706.87	11453.79
90	11083.89	10710.41	11457.37
91	11087.44	10713.94	11460.94
92	11091.00	10717.48	11464.52
93	11094.55	10721.02	11468.09
94	11098.11	10724.55	11471.66
95	11101.66	10728.09	11475.24
96	11105.22	10731.63	11478.81
97	11108.78	10735.16	11482.39
98	11112.33	10738.70	11485.96
99	11115.89	10742.24	11489.54
100	11119.44	10745.77	11493.11
101	11123.00	10749.31	11496.68

回归诊断

library(car)
# Use a 2 x 2 grid for the diagnostic plots that follow.
par(mfrow = c(2, 2))
# Q-Q plot of the fitted model's residuals.
qqPlot(fit.lm, id.method = "identify", simulate = TRUE, main = "Q-Q plot")

# Plot the distribution of a fitted model's studentized residuals.
#
# Draws a density-scaled histogram of rstudent(fit), a rug of the jittered
# residuals, a normal curve using the residuals' mean and standard deviation,
# and a kernel density estimate, plus a legend for the two curves.
#
# Args:
#   fit:     fitted model object accepted by rstudent() (e.g. from lm()).
#   nbreaks: passed to hist() as `breaks`; defaults to 10.
residplot <- function(fit, nbreaks = 10) {
  z <- rstudent(fit)
  hist(z, breaks = nbreaks, freq = FALSE,
       xlab = "Studentized Residual",
       main = "Distribution of Errors")
  rug(jitter(z), col = "brown")
  curve(dnorm(x, mean = mean(z), sd = sd(z)),
        add = TRUE, col = "blue", lwd = 2)
  # Estimate the kernel density once and reuse it for both coordinates.
  kde <- density(z)
  lines(kde$x, kde$y, col = "red", lwd = 2, lty = 2)
  legend("topright",
         legend = c("Normal Curve", "Kernel Density Curve"),
         lty = 1:2, col = c("blue", "red"), cex = 0.7)
}
residplot(fit.lm)
# Durbin-Watson test: checks the errors for serial correlation.
durbinWatsonTest(fit.lm)

# Component + residual (partial residual) plots: used to judge whether the response is related non-linearly to a predictor, i.e. whether there is a systematic departure from the specified linear model. Drawn with crPlots() from the car package.
crPlots(fit.lm)

# VIF (variance inflation factor) check for multicollinearity.
# Rule of thumb: sqrt(VIF) > 2 indicates a multicollinearity problem.
# vif(fit.lm)  # not applicable here: this is a single-predictor regression
# sqrt(vif(fit.lm))>2

outlierTest(fit.lm)

-Wade Boggs

124

-Dave Parker

136

 lag Autocorrelation D-W Statistic p-value
   1     -0.05576414      2.100299   0.528
 Alternative hypothesis: rho != 0



             rstudent unadjusted p-value Bonferroni p
-Wade Boggs -4.888515         2.5292e-06   0.00039709

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-UTwwP8SS-1680423860909)(output_31_3.png)]

多项式回归

留出法

# Fix the random seed so the split is reproducible.
set.seed(1)
# Hold-out split: draw 60% of the row indices of `data` as the training set.
n_obs <- nrow(data)
train <- sample(n_obs, n_obs * 0.6)
# Fit a quadratic polynomial regression of CAtBat on Years using the
# training rows.
fit.lm <- lm(CAtBat ~ Years + I(Years^2), data, subset = train)
fit.lm
# Reading the summary: Residuals = summary statistics of the residuals;
# (Intercept) = the intercept; Estimate = ordinary-least-squares coefficient
# estimates; Std. Error = standard errors of those estimates;
# Multiple R-squared = goodness of fit (larger is better);
# F-statistic = test of the overall significance of the regression.
summary(fit.lm)

Call:
lm(formula = CAtBat ~ Years + I(Years^2), data = data, subset = train)

Coefficients:
(Intercept)        Years   I(Years^2)  
    -174.98       350.94         2.27  





Call:
lm(formula = CAtBat ~ Years + I(Years^2), data = data, subset = train)

Residuals:
     Min       1Q   Median       3Q      Max 
-2928.44  -393.39    41.81   561.16  2266.12 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  -174.97     215.76  -0.811    0.419    
Years         350.94      60.95   5.758 4.48e-08 ***
I(Years^2)      2.27       3.54   0.641    0.522    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 878.6 on 154 degrees of freedom
Multiple R-squared:  0.7792,	Adjusted R-squared:  0.7763 
F-statistic: 271.7 on 2 and 154 DF,  p-value: < 2.2e-16

# Test-set mean squared error of the hold-out fit. Columns are referenced
# through `data$` rather than attach(), so no variable can be silently picked
# up from the search path.
mean((data$CAtBat - predict(fit.lm, data))[-train]^2)

# Repeat the validation-set approach on 10 different random splits.
n_obs <- nrow(data)
err1 <- rep(0, 10)
for (i in seq_len(10)) {
  train2 <- sample(n_obs, n_obs * 0.6)
  # Refit the quadratic model of this section on each new training split
  # (the loop previously refitted the simple CAtBat ~ CHits model).
  lmfit2 <- lm(CAtBat ~ Years + I(Years^2), data, subset = train2)
  # Predict on the held-out rows of `data` (not `Auto`, which is not the data
  # the model was fitted on), so predictions line up with the observed values.
  pred2 <- predict(lmfit2, data[-train2, ])
  err1[i] <- mean((data$CAtBat[-train2] - pred2)^2)
}
plot(1:10, err1, xlab = "", type = "l",
     main = "选取10个不同的训练集对应的测试误差")

The following objects are masked from data (pos = 13):

    Assists, AtBat, CAtBat, CHits, CHmRun, CRBI, CRuns, CWalks,
    Division, Errors, Hits, HmRun, League, NewLeague, PutOuts, RBI,
    Runs, Salary, Walks, Years

1034057.64239113

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-vkwPYkFh-1680423860910)(output_35_2.png)]

交叉检验——留一法

# Preprocessing: average CAtBat within each value of Years, then fit the
# model to those means.
df <- data.frame(data[, c("Years", "CAtBat")])
df2 <- aggregate(df$CAtBat, by = list(type = df$Years), FUN = mean)
library(tidyverse)
df2 <- rename(df2, c("Years" = type, "CAtBat" = x))

# Resampling scheme: leave-one-out cross-validation.
train.control <- trainControl(method = "LOOCV")
# Fit the model; method = "lm" selects linear regression (here with
# polynomial terms in Years up to degree 3).
model <- train(CAtBat ~ Years + I(Years^2) + I(Years^3), df2,
               method = "lm", trControl = train.control)
model
model$finalModel

# Plot the yearly means and overlay the fitted curve.
plot(df2$Years, df2$CAtBat)
lines(df2$Years, fitted(model$finalModel))

# Draw the diagnostic plots of the final model on a 2 x 2 grid.
par(mfrow = c(2, 2))
plot(model$finalModel)

Linear Regression 

21 samples
 1 predictor

No pre-processing
Resampling: Leave-One-Out Cross-Validation 
Summary of sample sizes: 20, 20, 20, 20, 20, 20, ... 
Resampling results:

  RMSE      Rsquared   MAE     
  607.3668  0.9706062  354.2228

Tuning parameter 'intercept' was held constant at a value of TRUE




Call:
lm(formula = .outcome ~ ., data = dat)

Coefficients:
 (Intercept)         Years  `I(Years^2)`  `I(Years^3)`  
    -589.818       591.534       -29.969         1.262

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-twPjDGjQ-1680423860910)(output_38_2.png)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-MM409tU4-1680423860911)(output_38_3.png)]

尝试后，可能三次较好

多元线性回归：高阶的、交互项的

逐步回归

# Full model: regress Salary on every other variable in `data`.
lm.fit <- lm(Salary ~ ., data)
# Stepwise selection starting from the full model.
lm.step1 <- step(lm.fit)

Start:  AIC=3035.44
Salary ~ AtBat + Hits + HmRun + Runs + RBI + Walks + Years + 
    CAtBat + CHits + CHmRun + CRuns + CRBI + CWalks + League + 
    Division + PutOuts + Assists + Errors + NewLeague

            Df Sum of Sq      RSS    AIC
- CHmRun     1       973 24187888 3033.4
- CHits      1      4570 24191486 3033.5
- NewLeague  1     10330 24197245 3033.6
- Years      1     10569 24197485 3033.6
- RBI        1     15126 24202042 3033.6
- HmRun      1     47407 24234323 3033.9
- Runs       1     58931 24245847 3034.1
- Errors     1     61915 24248830 3034.1
- League     1     62043 24248959 3034.1
- CRBI       1    133884 24320800 3034.9
- CAtBat     1    163910 24350826 3035.2
<none>                   24186916 3035.4
- Assists    1    288343 24475259 3036.6
- CRuns      1    377536 24564451 3037.5
- CWalks     1    613995 24800910 3040.0
- Division   1    842840 25029755 3042.4
- AtBat      1    969060 25155975 3043.7
- Hits       1    970685 25157601 3043.8
- Walks      1   1146914 25333830 3045.6
- PutOuts    1   1294459 25481375 3047.1

Step:  AIC=3033.45
Salary ~ AtBat + Hits + HmRun + Runs + RBI + Walks + Years + 
    CAtBat + CHits + CRuns + CRBI + CWalks + League + Division + 
    PutOuts + Assists + Errors + NewLeague

            Df Sum of Sq      RSS    AIC
- Years      1     10311 24198200 3031.6
- NewLeague  1     10797 24198686 3031.6
- RBI        1     14186 24202074 3031.6
- CHits      1     15061 24202949 3031.6
- HmRun      1     52012 24239900 3032.0
- Runs       1     60645 24248533 3032.1
- Errors     1     63117 24251005 3032.1
- League     1     63247 24251136 3032.1
- CAtBat     1    178916 24366805 3033.4
<none>                   24187888 3033.4
- Assists    1    293831 24481719 3034.6
- CRuns      1    619537 24807425 3038.1
- CWalks     1    651484 24839372 3038.4
- Division   1    843039 25030928 3040.4
- CRBI       1    865749 25053637 3040.7
- AtBat      1    968661 25156549 3041.7
- Hits       1   1006060 25193949 3042.1
- Walks      1   1158436 25346324 3043.7
- PutOuts    1   1299490 25487379 3045.2

Step:  AIC=3031.56
Salary ~ AtBat + Hits + HmRun + Runs + RBI + Walks + CAtBat + 
    CHits + CRuns + CRBI + CWalks + League + Division + PutOuts + 
    Assists + Errors + NewLeague

            Df Sum of Sq      RSS    AIC
- NewLeague  1     10341 24208540 3029.7
- RBI        1     15435 24213635 3029.7
- CHits      1     19987 24218187 3029.8
- HmRun      1     53660 24251860 3030.1
- Runs       1     59127 24257327 3030.2
- Errors     1     60105 24258304 3030.2
- League     1     65925 24264125 3030.3
<none>                   24198200 3031.6
- CAtBat     1    275224 24473424 3032.5
- Assists    1    301166 24499366 3032.8
- CWalks     1    654794 24852994 3036.6
- CRuns      1    655377 24853577 3036.6
- Division   1    833693 25031893 3038.4
- CRBI       1    874410 25072610 3038.9
- AtBat      1    961565 25159764 3039.8
- Hits       1    996477 25194677 3040.1
- Walks      1   1155832 25354031 3041.8
- PutOuts    1   1312785 25510984 3043.4

Step:  AIC=3029.67
Salary ~ AtBat + Hits + HmRun + Runs + RBI + Walks + CAtBat + 
    CHits + CRuns + CRBI + CWalks + League + Division + PutOuts + 
    Assists + Errors

           Df Sum of Sq      RSS    AIC
- RBI       1     15256 24223796 3027.8
- CHits     1     17373 24225913 3027.9
- HmRun     1     54476 24263016 3028.3
- Errors    1     57130 24265671 3028.3
- Runs      1     58326 24266866 3028.3
- League    1    105730 24314271 3028.8
<none>                  24208540 3029.7
- CAtBat    1    270031 24478572 3030.6
- Assists   1    303082 24511622 3030.9
- CWalks    1    654258 24862798 3034.7
- CRuns     1    670419 24878960 3034.8
- Division  1    830667 25039208 3036.5
- CRBI      1    882812 25091352 3037.1
- AtBat     1    992385 25200926 3038.2
- Hits      1   1013020 25221560 3038.4
- Walks     1   1148399 25356939 3039.8
- PutOuts   1   1313746 25522287 3041.5

Step:  AIC=3027.84
Salary ~ AtBat + Hits + HmRun + Runs + Walks + CAtBat + CHits + 
    CRuns + CRBI + CWalks + League + Division + PutOuts + Assists + 
    Errors

           Df Sum of Sq      RSS    AIC
- CHits     1     14968 24238763 3026.0
- HmRun     1     44993 24268789 3026.3
- Runs      1     50436 24274232 3026.4
- Errors    1     60377 24284173 3026.5
- League    1    106822 24330618 3027.0
<none>                  24223796 3027.8
- CAtBat    1    261759 24485555 3028.7
- Assists   1    302461 24526257 3029.1
- CWalks    1    645358 24869154 3032.7
- CRuns     1    702809 24926605 3033.3
- Division  1    815566 25039362 3034.5
- CRBI      1    897559 25121355 3035.4
- Hits      1   1014099 25237895 3036.6
- AtBat     1   1035085 25258881 3036.8
- Walks     1   1137010 25360806 3037.9
- PutOuts   1   1318690 25542486 3039.7

Step:  AIC=3026
Salary ~ AtBat + Hits + HmRun + Runs + Walks + CAtBat + CRuns + 
    CRBI + CWalks + League + Division + PutOuts + Assists + Errors

           Df Sum of Sq      RSS    AIC
- HmRun     1     40626 24279390 3024.4
- Errors    1     54053 24292816 3024.6
- Runs      1     76176 24314940 3024.8
- League    1    113298 24352062 3025.2
<none>                  24238763 3026.0
- Assists   1    290031 24528795 3027.1
- CAtBat    1    623235 24861999 3030.7
- Division  1    807193 25045957 3032.6
- CRBI      1    907050 25145813 3033.6
- CWalks    1   1021560 25260323 3034.8
- Walks     1   1241594 25480357 3037.1
- AtBat     1   1337687 25576451 3038.1
- PutOuts   1   1387677 25626441 3038.6
- CRuns     1   1395747 25634510 3038.7
- Hits      1   1592616 25831379 3040.7

Step:  AIC=3024.44
Salary ~ AtBat + Hits + Runs + Walks + CAtBat + CRuns + CRBI + 
    CWalks + League + Division + PutOuts + Assists + Errors

           Df Sum of Sq      RSS    AIC
- Errors    1     46032 24325422 3022.9
- Runs      1     46359 24325749 3022.9
- League    1    102449 24381839 3023.5
<none>                  24279390 3024.4
- Assists   1    253322 24532712 3025.2
- CAtBat    1    662504 24941894 3029.5
- Division  1    801292 25080681 3030.9
- CWalks    1    992348 25271738 3032.9
- Walks     1   1201134 25480524 3035.1
- AtBat     1   1298852 25578242 3036.1
- CRuns     1   1356074 25635464 3036.7
- CRBI      1   1358716 25638106 3036.7
- PutOuts   1   1410607 25689996 3037.2
- Hits      1   1558262 25837652 3038.7

Step:  AIC=3022.94
Salary ~ AtBat + Hits + Runs + Walks + CAtBat + CRuns + CRBI + 
    CWalks + League + Division + PutOuts + Assists

           Df Sum of Sq      RSS    AIC
- Runs      1     51651 24377074 3021.5
- League    1     89806 24415228 3021.9
<none>                  24325422 3022.9
- Assists   1    224409 24549831 3023.3
- CAtBat    1    658222 24983644 3027.9
- Division  1    804432 25129854 3029.5
- CWalks    1    978988 25304410 3031.3
- Walks     1   1235957 25561379 3033.9
- CRBI      1   1335850 25661272 3034.9
- CRuns     1   1362206 25687628 3035.2
- PutOuts   1   1372903 25698325 3035.3
- AtBat     1   1377934 25703357 3035.4
- Hits      1   1636145 25961567 3038.0

Step:  AIC=3021.49
Salary ~ AtBat + Hits + Walks + CAtBat + CRuns + CRBI + CWalks + 
    League + Division + PutOuts + Assists

           Df Sum of Sq      RSS    AIC
- League    1    110612 24487686 3020.7
<none>                  24377074 3021.5
- Assists   1    285597 24662671 3022.5
- CAtBat    1    606762 24983836 3025.9
- Division  1    786536 25163610 3027.8
- CWalks    1    956574 25333648 3029.6
- Walks     1   1213748 25590821 3032.2
- CRuns     1   1334789 25711863 3033.5
- CRBI      1   1365809 25742883 3033.8
- PutOuts   1   1431602 25808675 3034.4
- AtBat     1   1519201 25896274 3035.3
- Hits      1   1712234 26089308 3037.3

Step:  AIC=3020.68
Salary ~ AtBat + Hits + Walks + CAtBat + CRuns + CRBI + CWalks + 
    Division + PutOuts + Assists

           Df Sum of Sq      RSS    AIC
<none>                  24487686 3020.7
- Assists   1    319213 24806899 3022.1
- CAtBat    1    546871 25034557 3024.5
- Division  1    805422 25293108 3027.2
- CWalks    1    977622 25465308 3028.9
- CRuns     1   1270026 25757712 3031.9
- Walks     1   1290266 25777951 3032.1
- CRBI      1   1332094 25819779 3032.6
- PutOuts   1   1523160 26010846 3034.5
- AtBat     1   1584954 26072640 3035.1
- Hits      1   1708829 26196515 3036.3

# Stepwise selection starting from the full model.
lm.step2 <- step(lm.fit)
# Summarise the model that the stepwise search ends on.
summary(lm.step2)

Start:  AIC=3035.44
Salary ~ AtBat + Hits + HmRun + Runs + RBI + Walks + Years + 
    CAtBat + CHits + CHmRun + CRuns + CRBI + CWalks + League + 
    Division + PutOuts + Assists + Errors + NewLeague

            Df Sum of Sq      RSS    AIC
- CHmRun     1       973 24187888 3033.4
- CHits      1      4570 24191486 3033.5
- NewLeague  1     10330 24197245 3033.6
- Years      1     10569 24197485 3033.6
- RBI        1     15126 24202042 3033.6
- HmRun      1     47407 24234323 3033.9
- Runs       1     58931 24245847 3034.1
- Errors     1     61915 24248830 3034.1
- League     1     62043 24248959 3034.1
- CRBI       1    133884 24320800 3034.9
- CAtBat     1    163910 24350826 3035.2
<none>                   24186916 3035.4
- Assists    1    288343 24475259 3036.6
- CRuns      1    377536 24564451 3037.5
- CWalks     1    613995 24800910 3040.0
- Division   1    842840 25029755 3042.4
- AtBat      1    969060 25155975 3043.7
- Hits       1    970685 25157601 3043.8
- Walks      1   1146914 25333830 3045.6
- PutOuts    1   1294459 25481375 3047.1

Step:  AIC=3033.45
Salary ~ AtBat + Hits + HmRun + Runs + RBI + Walks + Years + 
    CAtBat + CHits + CRuns + CRBI + CWalks + League + Division + 
    PutOuts + Assists + Errors + NewLeague

            Df Sum of Sq      RSS    AIC
- Years      1     10311 24198200 3031.6
- NewLeague  1     10797 24198686 3031.6
- RBI        1     14186 24202074 3031.6
- CHits      1     15061 24202949 3031.6
- HmRun      1     52012 24239900 3032.0
- Runs       1     60645 24248533 3032.1
- Errors     1     63117 24251005 3032.1
- League     1     63247 24251136 3032.1
- CAtBat     1    178916 24366805 3033.4
<none>                   24187888 3033.4
- Assists    1    293831 24481719 3034.6
- CRuns      1    619537 24807425 3038.1
- CWalks     1    651484 24839372 3038.4
- Division   1    843039 25030928 3040.4
- CRBI       1    865749 25053637 3040.7
- AtBat      1    968661 25156549 3041.7
- Hits       1   1006060 25193949 3042.1
- Walks      1   1158436 25346324 3043.7
- PutOuts    1   1299490 25487379 3045.2

Step:  AIC=3031.56
Salary ~ AtBat + Hits + HmRun + Runs + RBI + Walks + CAtBat + 
    CHits + CRuns + CRBI + CWalks + League + Division + PutOuts + 
    Assists + Errors + NewLeague

            Df Sum of Sq      RSS    AIC
- NewLeague  1     10341 24208540 3029.7
- RBI        1     15435 24213635 3029.7
- CHits      1     19987 24218187 3029.8
- HmRun      1     53660 24251860 3030.1
- Runs       1     59127 24257327 3030.2
- Errors     1     60105 24258304 3030.2
- League     1     65925 24264125 3030.3
<none>                   24198200 3031.6
- CAtBat     1    275224 24473424 3032.5
- Assists    1    301166 24499366 3032.8
- CWalks     1    654794 24852994 3036.6
- CRuns      1    655377 24853577 3036.6
- Division   1    833693 25031893 3038.4
- CRBI       1    874410 25072610 3038.9
- AtBat      1    961565 25159764 3039.8
- Hits       1    996477 25194677 3040.1
- Walks      1   1155832 25354031 3041.8
- PutOuts    1   1312785 25510984 3043.4

Step:  AIC=3029.67
Salary ~ AtBat + Hits + HmRun + Runs + RBI + Walks + CAtBat + 
    CHits + CRuns + CRBI + CWalks + League + Division + PutOuts + 
    Assists + Errors

           Df Sum of Sq      RSS    AIC
- RBI       1     15256 24223796 3027.8
- CHits     1     17373 24225913 3027.9
- HmRun     1     54476 24263016 3028.3
- Errors    1     57130 24265671 3028.3
- Runs      1     58326 24266866 3028.3
- League    1    105730 24314271 3028.8
<none>                  24208540 3029.7
- CAtBat    1    270031 24478572 3030.6
- Assists   1    303082 24511622 3030.9
- CWalks    1    654258 24862798 3034.7
- CRuns     1    670419 24878960 3034.8
- Division  1    830667 25039208 3036.5
- CRBI      1    882812 25091352 3037.1
- AtBat     1    992385 25200926 3038.2
- Hits      1   1013020 25221560 3038.4
- Walks     1   1148399 25356939 3039.8
- PutOuts   1   1313746 25522287 3041.5

Step:  AIC=3027.84
Salary ~ AtBat + Hits + HmRun + Runs + Walks + CAtBat + CHits + 
    CRuns + CRBI + CWalks + League + Division + PutOuts + Assists + 
    Errors

           Df Sum of Sq      RSS    AIC
- CHits     1     14968 24238763 3026.0
- HmRun     1     44993 24268789 3026.3
- Runs      1     50436 24274232 3026.4
- Errors    1     60377 24284173 3026.5
- League    1    106822 24330618 3027.0
<none>                  24223796 3027.8
- CAtBat    1    261759 24485555 3028.7
- Assists   1    302461 24526257 3029.1
- CWalks    1    645358 24869154 3032.7
- CRuns     1    702809 24926605 3033.3
- Division  1    815566 25039362 3034.5
- CRBI      1    897559 25121355 3035.4
- Hits      1   1014099 25237895 3036.6
- AtBat     1   1035085 25258881 3036.8
- Walks     1   1137010 25360806 3037.9
- PutOuts   1   1318690 25542486 3039.7

Step:  AIC=3026
Salary ~ AtBat + Hits + HmRun + Runs + Walks + CAtBat + CRuns + 
    CRBI + CWalks + League + Division + PutOuts + Assists + Errors

           Df Sum of Sq      RSS    AIC
- HmRun     1     40626 24279390 3024.4
- Errors    1     54053 24292816 3024.6
- Runs      1     76176 24314940 3024.8
- League    1    113298 24352062 3025.2
<none>                  24238763 3026.0
- Assists   1    290031 24528795 3027.1
- CAtBat    1    623235 24861999 3030.7
- Division  1    807193 25045957 3032.6
- CRBI      1    907050 25145813 3033.6
- CWalks    1   1021560 25260323 3034.8
- Walks     1   1241594 25480357 3037.1
- AtBat     1   1337687 25576451 3038.1
- PutOuts   1   1387677 25626441 3038.6
- CRuns     1   1395747 25634510 3038.7
- Hits      1   1592616 25831379 3040.7

Step:  AIC=3024.44
Salary ~ AtBat + Hits + Runs + Walks + CAtBat + CRuns + CRBI + 
    CWalks + League + Division + PutOuts + Assists + Errors

           Df Sum of Sq      RSS    AIC
- Errors    1     46032 24325422 3022.9
- Runs      1     46359 24325749 3022.9
- League    1    102449 24381839 3023.5
<none>                  24279390 3024.4
- Assists   1    253322 24532712 3025.2
- CAtBat    1    662504 24941894 3029.5
- Division  1    801292 25080681 3030.9
- CWalks    1    992348 25271738 3032.9
- Walks     1   1201134 25480524 3035.1
- AtBat     1   1298852 25578242 3036.1
- CRuns     1   1356074 25635464 3036.7
- CRBI      1   1358716 25638106 3036.7
- PutOuts   1   1410607 25689996 3037.2
- Hits      1   1558262 25837652 3038.7

Step:  AIC=3022.94
Salary ~ AtBat + Hits + Runs + Walks + CAtBat + CRuns + CRBI + 
    CWalks + League + Division + PutOuts + Assists

           Df Sum of Sq      RSS    AIC
- Runs      1     51651 24377074 3021.5
- League    1     89806 24415228 3021.9
<none>                  24325422 3022.9
- Assists   1    224409 24549831 3023.3
- CAtBat    1    658222 24983644 3027.9
- Division  1    804432 25129854 3029.5
- CWalks    1    978988 25304410 3031.3
- Walks     1   1235957 25561379 3033.9
- CRBI      1   1335850 25661272 3034.9
- CRuns     1   1362206 25687628 3035.2
- PutOuts   1   1372903 25698325 3035.3
- AtBat     1   1377934 25703357 3035.4
- Hits      1   1636145 25961567 3038.0

Step:  AIC=3021.49
Salary ~ AtBat + Hits + Walks + CAtBat + CRuns + CRBI + CWalks + 
    League + Division + PutOuts + Assists

           Df Sum of Sq      RSS    AIC
- League    1    110612 24487686 3020.7
<none>                  24377074 3021.5
- Assists   1    285597 24662671 3022.5
- CAtBat    1    606762 24983836 3025.9
- Division  1    786536 25163610 3027.8
- CWalks    1    956574 25333648 3029.6
- Walks     1   1213748 25590821 3032.2
- CRuns     1   1334789 25711863 3033.5
- CRBI      1   1365809 25742883 3033.8
- PutOuts   1   1431602 25808675 3034.4
- AtBat     1   1519201 25896274 3035.3
- Hits      1   1712234 26089308 3037.3

Step:  AIC=3020.68
Salary ~ AtBat + Hits + Walks + CAtBat + CRuns + CRBI + CWalks + 
    Division + PutOuts + Assists

           Df Sum of Sq      RSS    AIC
<none>                  24487686 3020.7
- Assists   1    319213 24806899 3022.1
- CAtBat    1    546871 25034557 3024.5
- Division  1    805422 25293108 3027.2
- CWalks    1    977622 25465308 3028.9
- CRuns     1   1270026 25757712 3031.9
- Walks     1   1290266 25777951 3032.1
- CRBI      1   1332094 25819779 3032.6
- PutOuts   1   1523160 26010846 3034.5
- AtBat     1   1584954 26072640 3035.1
- Hits      1   1708829 26196515 3036.3




Call:
lm(formula = Salary ~ AtBat + Hits + Walks + CAtBat + CRuns + 
    CRBI + CWalks + Division + PutOuts + Assists, data = data)

Residuals:
    Min      1Q  Median      3Q     Max 
-940.62 -177.71  -33.17  134.08 1909.31 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)  163.24108   67.05209   2.435 0.015608 *  
AtBat         -2.16562    0.53729  -4.031 7.38e-05 ***
Hits           6.90505    1.64989   4.185 3.94e-05 ***
Walks          5.77344    1.58757   3.637 0.000335 ***
CAtBat        -0.13315    0.05624  -2.368 0.018664 *  
CRuns          1.43365    0.39735   3.608 0.000372 ***
CRBI           0.77607    0.21003   3.695 0.000270 ***
CWalks        -0.83874    0.26496  -3.166 0.001739 ** 
DivisionW   -112.96345   39.31545  -2.873 0.004410 ** 
PutOuts        0.29542    0.07477   3.951 0.000101 ***
Assists        0.28603    0.15813   1.809 0.071670 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 312.3 on 251 degrees of freedom
  (1 observation deleted due to missingness)
Multiple R-squared:  0.5407,	Adjusted R-squared:  0.5224 
F-statistic: 29.55 on 10 and 251 DF,  p-value: < 2.2e-16

留出法

# Regress Salary on every other column of `data` and print the coefficient
# table to see which predictors are related to salary.
salary_full_fit <- lm(Salary ~ ., data = data)
summary(salary_full_fit)

Call:
lm(formula = Salary ~ ., data = data)

Residuals:
    Min      1Q  Median      3Q     Max 
-908.37 -178.95  -31.67  141.08 1874.33 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)  167.10276   91.57537   1.825 0.069271 .  
AtBat         -1.97769    0.63513  -3.114 0.002069 ** 
Hits           7.44005    2.38737   3.116 0.002052 ** 
HmRun          4.27968    6.21401   0.689 0.491662    
Runs          -2.29854    2.99337  -0.768 0.443310    
RBI           -1.01413    2.60683  -0.389 0.697598    
Walks          6.20858    1.83278   3.388 0.000823 ***
Years         -4.07600   12.53431  -0.325 0.745320    
CAtBat        -0.17368    0.13562  -1.281 0.201553    
CHits          0.14463    0.67636   0.214 0.830857    
CHmRun        -0.15985    1.62049  -0.099 0.921502    
CRuns          1.46166    0.75205   1.944 0.053109 .  
CRBI           0.80318    0.69396   1.157 0.248251    
CWalks        -0.81493    0.32879  -2.479 0.013874 *  
LeagueN       62.56007   79.40244   0.788 0.431534    
DivisionW   -117.56598   40.48478  -2.904 0.004025 ** 
PutOuts        0.27988    0.07777   3.599 0.000388 ***
Assists        0.37754    0.22228   1.699 0.090694 .  
Errors        -3.47048    4.40935  -0.787 0.432010    
NewLeagueN   -25.45031   79.16482  -0.321 0.748120    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 316.1 on 242 degrees of freedom
  (1 observation deleted due to missingness)
Multiple R-squared:  0.5463,	Adjusted R-squared:  0.5107 
F-statistic: 15.34 on 19 and 242 DF,  p-value: < 2.2e-16

# Fix the RNG seed so the train/test split below is reproducible.
set.seed(1)
# Hold-out split: draw 60% of the row indices of `data` as the training set.
n_obs <- nrow(data)
train <- sample(n_obs, n_obs * 0.6)
# Fit Salary on AtBat, Hits, Walks and PutOuts using the training rows only.
fit.lm <- lm(Salary ~ AtBat + Hits + Walks + PutOuts, data = data,
             subset = train)
fit.lm
# Reading summary(): Residuals = residual statistics; (Intercept) = intercept;
# Estimate = ordinary-least-squares coefficient estimates; Std. Error =
# standard error of each estimate; Multiple R-squared = goodness of fit
# (larger is better); F-statistic = significance test of the whole equation.
summary(fit.lm)

Call:
lm(formula = Salary ~ AtBat + Hits + Walks + PutOuts, data = data, 
    subset = train)

Coefficients:
(Intercept)        AtBat         Hits        Walks      PutOuts  
   127.3390      -1.9234       7.1114       8.0472       0.2953  





Call:
lm(formula = Salary ~ AtBat + Hits + Walks + PutOuts, data = data, 
    subset = train)

Residuals:
    Min      1Q  Median      3Q     Max 
-896.36 -217.42  -64.84  176.82 2008.31 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 127.3390    93.2714   1.365  0.17419    
AtBat        -1.9234     0.8224  -2.339  0.02065 *  
Hits          7.1114     2.6684   2.665  0.00853 ** 
Walks         8.0472     1.9202   4.191  4.7e-05 ***
PutOuts       0.2953     0.1192   2.479  0.01429 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 381 on 152 degrees of freedom
Multiple R-squared:  0.3069,	Adjusted R-squared:  0.2887 
F-statistic: 16.83 on 4 and 152 DF,  p-value: 1.935e-11

# NOTE(review): attach() is kept only in case other cells rely on bare column
# names; nothing in this block needs it, since columns are read via `data$`.
attach(data)
# Test mean squared error of fit.lm on the rows that were not used to fit it.
mean((data$Salary - predict(fit.lm, data))[-train]^2)

# Repeat the validation-set approach 10 times with fresh random 60/40 splits
# and record the test MSE of each split.
err1 <- rep(0, 10)
for (i in seq_along(err1)) {
  train2 <- sample(nrow(data), nrow(data) * 0.6)
  lmfit2 <- lm(Salary ~ AtBat + Hits + Walks + PutOuts, data = data,
               subset = train2)
  # Predict on the held-out rows of `data`. The previous version passed rows
  # of the unrelated `Auto` data set here, so the predictions did not line up
  # with the held-out salaries.
  pred2 <- predict(lmfit2, data[-train2, ])
  err1[i] <- mean((data$Salary[-train2] - pred2)^2)
}
plot(1:10, err1, xlab = "", type = "l", main = "选取10个不同的训练集对应的测试误差")

The following objects are masked from data (pos = 3):

    Assists, AtBat, CAtBat, CHits, CHmRun, CRBI, CRuns, CWalks,
    Division, Errors, Hits, HmRun, League, NewLeague, PutOuts, RBI,
    Runs, Salary, Walks, Years


The following objects are masked from data (pos = 14):

    Assists, AtBat, CAtBat, CHits, CHmRun, CRBI, CRuns, CWalks,
    Division, Errors, Hits, HmRun, League, NewLeague, PutOuts, RBI,
    Runs, Salary, Walks, Years

147498.365090483

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-GVEJSp5m-1680423860913)(output_47_2.png)]

留一法

# Resampling scheme: leave-one-out cross-validation.
train.control <- trainControl(method = "LOOCV")
# Train a linear regression (method = "lm") under that scheme.
model <- train(
  Salary ~ AtBat + Hits + Walks + PutOuts,
  data = data,
  method = "lm",
  trControl = train.control
)
model
model$finalModel

# Show the plots of the final model on a 2 x 2 grid.
par(mfrow = c(2, 2))
plot(model$finalModel)

Linear Regression 

263 samples
  4 predictor

No pre-processing
Resampling: Leave-One-Out Cross-Validation 
Summary of sample sizes: 262, 262, 262, 262, 262, 262, ... 
Resampling results:

  RMSE      Rsquared   MAE     
  387.4613  0.2607978  278.2137

Tuning parameter 'intercept' was held constant at a value of TRUE




Call:
lm(formula = .outcome ~ ., data = dat)

Coefficients:
(Intercept)        AtBat         Hits        Walks      PutOuts  
   107.9885      -2.1739       8.8471       6.6693       0.2657

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-7nA34cEq-1680423860913)(output_49_2.png)]

留一法的拟合优度R方很小，比留出法还差

K折交叉检验

# 10-fold cross-validation (a single pass); the seed is fixed before training.
set.seed(123)
train.control <- trainControl(method = "cv", number = 10)
# Train the linear regression under 10-fold CV.
model <- train(
  Salary ~ AtBat + Hits + Walks + PutOuts,
  data = data,
  method = "lm",
  trControl = train.control
)
summary(model$finalModel)

# Show the plots of the final model on a 2 x 2 grid.
par(mfrow = c(2, 2))
plot(model$finalModel)

Call:
lm(formula = .outcome ~ ., data = dat)

Residuals:
    Min      1Q  Median      3Q     Max 
-990.80 -228.07  -64.33  176.96 2033.25 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 107.9885    72.2842   1.494 0.136413    
AtBat        -2.1739     0.6236  -3.486 0.000576 ***
Hits          8.8471     1.9625   4.508 9.93e-06 ***
Walks         6.6693     1.3985   4.769 3.10e-06 ***
PutOuts       0.2657     0.0889   2.988 0.003074 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 380.3 on 258 degrees of freedom
Multiple R-squared:    0.3,	Adjusted R-squared:  0.2892 
F-statistic: 27.65 on 4 and 258 DF,  p-value: < 2.2e-16

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-ciQSgOik-1680423860914)(output_52_1.png)]

重复K折交叉验证

# Repeated k-fold cross-validation: 10 folds, repeated 3 times, with a fixed
# seed set before training.
set.seed(123)
train.control <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 3
)
# Train the linear regression under that scheme.
model <- train(
  Salary ~ AtBat + Hits + Walks + PutOuts,
  data = data,
  method = "lm",
  trControl = train.control
)
summary(model$finalModel)

# Confidence intervals for the coefficients of the final model.
confint(model$finalModel)

# Show the plots of the final model on a 2 x 2 grid.
par(mfrow = c(2, 2))
plot(model$finalModel)

Call:
lm(formula = .outcome ~ ., data = dat)

Residuals:
    Min      1Q  Median      3Q     Max 
-990.80 -228.07  -64.33  176.96 2033.25 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 107.9885    72.2842   1.494 0.136413    
AtBat        -2.1739     0.6236  -3.486 0.000576 ***
Hits          8.8471     1.9625   4.508 9.93e-06 ***
Walks         6.6693     1.3985   4.769 3.10e-06 ***
PutOuts       0.2657     0.0889   2.988 0.003074 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 380.3 on 258 degrees of freedom
Multiple R-squared:    0.3,	Adjusted R-squared:  0.2892 
F-statistic: 27.65 on 4 and 258 DF,  p-value: < 2.2e-16

A matrix: 5 × 2 of type dbl
	2.5 %	97.5 %
(Intercept)	-34.35369004	250.3306400
AtBat	-3.40188946	-0.9459061
Hits	4.98255201	12.7117007
Walks	3.91540097	9.4232171
PutOuts	0.09061403	0.4407509

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-GKFmdmpj-1680423860914)(output_54_2.png)]

K折交叉检验虽然对拟合优度有所提升，但拟合效果依旧很差

# Regression with an interaction term.
# In an lm formula, x * y expands to x + y + x:y (both main effects plus the
# interaction), whereas x:y is the interaction term alone.
interaction_fit <- lm(Salary ~ AtBat * PutOuts, data = data)
summary(interaction_fit)
par(mfrow = c(2, 2))
plot(interaction_fit)

Call:
lm(formula = Salary ~ AtBat * PutOuts, data = data)

Residuals:
    Min      1Q  Median      3Q     Max 
-923.42 -248.32  -51.22  182.19 2002.85 

Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)    1.070e+02  1.096e+02   0.976  0.33001    
AtBat          8.524e-01  2.554e-01   3.337  0.00097 ***
PutOuts       -7.263e-03  3.603e-01  -0.020  0.98393    
AtBat:PutOuts  6.691e-04  7.158e-04   0.935  0.35077    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 407.4 on 259 degrees of freedom
Multiple R-squared:  0.1937,	Adjusted R-squared:  0.1844 
F-statistic: 20.74 on 3 and 259 DF,  p-value: 4.476e-12

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-iWMoB9TA-1680423860915)(output_56_1.png)]

广义线性回归

对数回归

# Regress the logarithm of Salary on Walks (the result is stored in fit.lm).
fit.lm <- lm(log(Salary) ~ Walks, data = data)
summary(fit.lm)

Call:
lm(formula = log(Salary) ~ Walks, data = data)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.8512 -0.6127  0.1142  0.6251  2.4633 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 5.199290   0.106206  48.955  < 2e-16 ***
Walks       0.017705   0.002285   7.748 2.07e-13 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.8033 on 261 degrees of freedom
Multiple R-squared:  0.187,	Adjusted R-squared:  0.1839 
F-statistic: 60.03 on 1 and 261 DF,  p-value: 2.075e-13

Logistic回归

留出法

# Hold-out evaluation of a logistic regression that predicts NewLeague.
# Split Hitters into training and test rows with caret's createDataPartition
# (p = 0.8); the seed is fixed first.
set.seed(123)
training.samples <- createDataPartition(Hitters$NewLeague, p = 0.8,
                                        list = FALSE)
train.data <- Hitters[training.samples, ]
test.data <- Hitters[-training.samples, ]

# Logistic regression (logit link) of NewLeague on all other columns.
glm.fits_1 <- glm(NewLeague ~ ., data = train.data,
                  family = binomial(link = "logit"))
summary(glm.fits_1)

# Predicted probabilities on the test rows. Rows with a missing predictor
# (Hitters has missing values) get an NA probability.
glm.probs <- predict(glm.fits_1, test.data, type = "response")
# Show how the factor levels are coded, i.e. which level is 0 and which is 1.
contrasts(train.data$NewLeague)
# Classify with a 0.5 threshold. An NA probability stays NA instead of being
# silently labelled "A", which is what pre-filling with "A" used to do.
glm.pred <- ifelse(glm.probs > 0.5, "N", "A")
# Confusion matrix (rows that could not be predicted are left out).
table(glm.pred, test.data$NewLeague)
# Accuracy over the rows that received a prediction.
mean(glm.pred == test.data$NewLeague, na.rm = TRUE)

Call:
glm(formula = NewLeague ~ ., family = binomial(link = "logit"), 
    data = train.data)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.3871  -0.2839  -0.1428   0.3762   3.1521  

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept) -4.1111835  1.3979353  -2.941  0.00327 ** 
AtBat        0.0139516  0.0099547   1.402  0.16106    
Hits        -0.0238494  0.0385859  -0.618  0.53652    
HmRun       -0.0230690  0.0950296  -0.243  0.80819    
Runs        -0.0422197  0.0462859  -0.912  0.36169    
RBI          0.0088451  0.0384175   0.230  0.81791    
Walks        0.0303770  0.0303220   1.002  0.31643    
Years       -0.2004386  0.1811425  -1.107  0.26850    
CAtBat      -0.0018551  0.0023725  -0.782  0.43427    
CHits        0.0156182  0.0118849   1.314  0.18881    
CHmRun       0.0062912  0.0257702   0.244  0.80713    
CRuns       -0.0118422  0.0101635  -1.165  0.24395    
CRBI        -0.0063914  0.0115327  -0.554  0.57945    
CWalks       0.0030614  0.0056898   0.538  0.59055    
LeagueN      5.7242274  0.7498249   7.634 2.27e-14 ***
DivisionW    0.0555476  0.6396310   0.087  0.93080    
PutOuts     -0.0004156  0.0011629  -0.357  0.72080    
Assists     -0.0033126  0.0036206  -0.915  0.36023    
Errors      -0.0074453  0.0649022  -0.115  0.90867    
Salary      -0.0007739  0.0009779  -0.791  0.42875    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 281.214  on 203  degrees of freedom
Residual deviance:  91.518  on 184  degrees of freedom
  (54 observations deleted due to missingness)
AIC: 131.52

Number of Fisher Scoring iterations: 6

A matrix: 2 × 1 of type dbl
	N
A	0
N	1

glm.pred  A  N
       A 34  3
       N  1 26

0.9375

正确率达到0.9375。

# Same hold-out evaluation as the logit model, but with a probit link.
glm.fits_1 <- glm(NewLeague ~ ., data = train.data,
                  family = binomial(link = "probit"))
summary(glm.fits_1)

# Predicted probabilities on the test rows. Rows with a missing predictor get
# an NA probability.
glm.probs <- predict(glm.fits_1, test.data, type = "response")
# Show how the factor levels are coded, i.e. which level is 0 and which is 1.
contrasts(train.data$NewLeague)
# Classify with a 0.5 threshold. An NA probability stays NA instead of being
# silently labelled "A", which is what pre-filling with "A" used to do.
glm.pred <- ifelse(glm.probs > 0.5, "N", "A")
# Confusion matrix (rows that could not be predicted are left out).
table(glm.pred, test.data$NewLeague)
# Accuracy over the rows that received a prediction.
mean(glm.pred == test.data$NewLeague, na.rm = TRUE)

Call:
glm(formula = NewLeague ~ ., family = binomial(link = "probit"), 
    data = train.data)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.3198  -0.2862  -0.1204   0.3753   2.9415  

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept) -2.0228580  0.6800483  -2.975  0.00293 ** 
AtBat        0.0055448  0.0048246   1.149  0.25044    
Hits        -0.0083577  0.0188149  -0.444  0.65689    
HmRun       -0.0018123  0.0464151  -0.039  0.96885    
Runs        -0.0223190  0.0228722  -0.976  0.32916    
RBI          0.0006059  0.0191925   0.032  0.97482    
Walks        0.0186796  0.0147079   1.270  0.20407    
Years       -0.1128607  0.0927634  -1.217  0.22374    
CAtBat      -0.0006854  0.0011552  -0.593  0.55297    
CHits        0.0071548  0.0059495   1.203  0.22913    
CHmRun       0.0038043  0.0128131   0.297  0.76654    
CRuns       -0.0056003  0.0052602  -1.065  0.28703    
CRBI        -0.0028734  0.0058602  -0.490  0.62391    
CWalks       0.0003207  0.0027509   0.117  0.90719    
LeagueN      3.1376015  0.3388132   9.261  < 2e-16 ***
DivisionW   -0.0295131  0.3149171  -0.094  0.92533    
PutOuts     -0.0001025  0.0005727  -0.179  0.85799    
Assists     -0.0013616  0.0017130  -0.795  0.42669    
Errors      -0.0053971  0.0317883  -0.170  0.86518    
Salary      -0.0004174  0.0004950  -0.843  0.39903    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 281.214  on 203  degrees of freedom
Residual deviance:  91.837  on 184  degrees of freedom
  (54 observations deleted due to missingness)
AIC: 131.84

Number of Fisher Scoring iterations: 7

A matrix: 2 × 1 of type dbl
	N
A	0
N	1

glm.pred  A  N
       A 34  3
       N  1 26

0.9375

probit 与logistic的效果差不多

交叉验证法(待修改)

set.seed(123)
# Number of times the 10-fold cross-validation is repeated. (The model below
# does not vary with i, so this is a repetition count, not a model order.)
n <- 10
cv.error.10 <- rep(0, n)
# Reference: the polynomial-degree example on the Auto data this loop was
# adapted from.
# for (i in 1:n){
#   glm.fit=glm(mpg~poly(horsepower,i),data=Auto)
#   cv.error.10[i]=cv.glm(Auto,glm.fit,K=10)$delta[1]
# }
# The fitted model is identical in every iteration, so fit it once.
glm.fit <- glm(NewLeague ~ ., data = data, family = binomial(link = "probit"))
for (i in seq_len(n)) {
  # cv.glm lives in the boot package, which was never attached; qualifying
  # the call fixes the 'could not find function "cv.glm"' error.
  cv.error.10[i] <- boot::cv.glm(data, glm.fit, K = 10)$delta[1]
}
cv.error.10

Error in cv.glm(data, glm.fit, K = 10): could not find function "cv.glm"
Traceback:

library(nnet)

# Fit a multinom model for NewLeague using only four of the predictors.
model <- nnet::multinom(
  NewLeague ~ AtBat + Hits + Walks + PutOuts,
  data = train.data
)
summary(model)

# weights:  6 (5 variable)
initial  value 178.831973 
final  value 176.202102 
converged



Call:
nnet::multinom(formula = NewLeague ~ AtBat + Hits + Walks + PutOuts, 
    data = train.data)

Coefficients:
                   Values    Std. Err.
(Intercept)  0.1680834713 0.3600629960
AtBat       -0.0008943925 0.0033379907
Hits        -0.0027225518 0.0105738112
Walks        0.0034700966 0.0080661133
PutOuts      0.0004106612 0.0004562542

Residual Deviance: 352.4042 
AIC: 362.4042

# Predict a class for every test row, peek at the first few, and report the
# share of rows whose prediction equals the true NewLeague.
predicted.classes <- predict(model, test.data)
head(predicted.classes)
mean(predicted.classes == test.data$NewLeague)

Levels:

0.53125

当减少变量时，准确率明显下降（由 0.9375 降至 0.53125），说明被去掉的变量中有一个与因变量高度相关；由前面 logistic/probit 回归的系数表可知，该变量是 League。

线性判别分析

# Rebuild the train/test split, this time on complete cases only.
library(ipred)
set.seed(123)
data <- na.omit(Hitters)
training.samples <- createDataPartition(data$NewLeague, p = 0.8,
                                        list = FALSE)
train.data <- data[training.samples, ]
test.data <- data[-training.samples, ]

# Author's workaround: loading the package ran into an internal problem, so
# these namespaces are unloaded before MASS is attached.
unloadNamespace("caret")
unloadNamespace("recipes")
unloadNamespace("ipred")
# MASS is loaded for lda().
library(MASS)

Attaching package: 'MASS'


The following object is masked from 'package:dplyr':

    select

# Fit a linear discriminant model for NewLeague on all other columns, print
# it, and plot it.
lda.fit <- lda(NewLeague ~ ., data = train.data)
lda.fit
plot(lda.fit)

Call:
lda(NewLeague ~ ., data = train.data)

Prior probabilities of groups:
       A        N 
0.535545 0.464455 

Group means:
     AtBat     Hits     HmRun     Runs      RBI    Walks    Years   CAtBat
A 406.0885 109.1504 12.592920 57.00000 52.77876 41.07080 7.283186 2637.832
N 385.7041 102.1122  9.479592 49.59184 46.96939 40.52041 7.428571 2751.612
     CHits   CHmRun    CRuns     CRBI   CWalks    LeagueN DivisionW  PutOuts
A 715.6018 72.73451 367.2212 332.5664 258.3186 0.07964602 0.4778761 256.9646
N 747.1224 61.76531 358.2959 330.0918 263.3673 0.94897959 0.5204082 303.2551
   Assists   Errors   Salary
A 113.4425 8.362832 521.8844
N 123.2143 8.846939 537.6504

Coefficients of linear discriminants:
                    LD1
AtBat      0.0028002936
Hits      -0.0031493663
HmRun      0.0104988252
Runs      -0.0107130571
RBI       -0.0024390763
Walks      0.0048200984
Years     -0.0460721796
CAtBat    -0.0000254391
CHits      0.0014060531
CHmRun     0.0001388015
CRuns     -0.0015124610
CRBI      -0.0006446335
CWalks     0.0002057631
LeagueN    4.0456150691
DivisionW  0.0351811062
PutOuts    0.0003566468
Assists    0.0002141323
Errors    -0.0242090302
Salary    -0.0001118049

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-IQTJfnTg-1680423860917)(output_76_1.png)]

# Predict on the test set and list the names of the returned components.
lda.pred <- predict(lda.fit, test.data)
paste('变量名称=', names(lda.pred))

# Predicted class labels.
lda.class <- lda.pred[["class"]]
head(lda.class)
# Posterior probabilities, one column per class.
lda.posterior <- lda.pred[["posterior"]]
head(lda.posterior)

'变量名称= class'
'变量名称= posterior'
'变量名称= x'

Levels:

A matrix: 6 × 2 of type dbl
	A	N
-Alvin Davis	0.9979368675	0.0020631325
-Andre Dawson	0.0007505105	0.9992494895
-Andres Galarraga	0.0003391899	0.9996608101
-Alfredo Griffin	0.9997526519	0.0002473481
-Andres Thomas	0.0017857792	0.9982142208
-Andre Thornton	0.9996359302	0.0003640698

# Confusion matrix: predicted class against the true NewLeague.
table(lda.class, test.data$NewLeague)

# Share of test rows classified correctly.
mean(lda.class == test.data$NewLeague)

# First column of the posterior matrix; the printed posteriors label it "A".
posterior_first <- lda.pred$posterior[, 1]

# Number of test rows on either side of a 0.5 threshold on that column; the
# same split is what lda.pred$class contains.
sum(posterior_first >= 0.5)

sum(posterior_first < 0.5)

# Compare the first 20 posterior rows with the first 20 predicted classes.
lda.pred$posterior[1:20, 1:2]
lda.class[1:20]

# Number of test rows whose first-column posterior is at least 0.9.
sum(posterior_first >= 0.9)

lda.class  A  N
        A 27  3
        N  1 21

0.923076923076923

A matrix: 20 × 2 of type dbl
	A	N
-Alvin Davis	0.9979368675	0.0020631325
-Andre Dawson	0.0007505105	0.9992494895
-Andres Galarraga	0.0003391899	0.9996608101
-Alfredo Griffin	0.9997526519	0.0002473481
-Andres Thomas	0.0017857792	0.9982142208
-Andre Thornton	0.9996359302	0.0003640698
-Bob Brenly	0.0001754981	0.9998245019
-Bill Doran	0.0009425210	0.9990574790
-Brian Downing	0.9996047108	0.0003952892
-Bill Madlock	0.0016847021	0.9983152979
-Chris Brown	0.0029810078	0.9970189922
-Carlton Fisk	0.9995762139	0.0004237861
-Carney Lansford	0.9989394170	0.0010605830
-Darrell Evans	0.9987166364	0.0012833636
-Damaso Garcia	0.9995550027	0.0004449973
-Don Mattingly	0.9991239245	0.0008760755
-Eric Davis	0.0023883074	0.9976116926
-Eddie Milner	0.0011534457	0.9988465543
-George Bell	0.9997279118	0.0002720882
-Greg Brock	0.0002475808	0.9997524192

Levels:

#阈值设为0.9时有30个是满足的

贝叶斯

# Naive Bayes classification of NewLeague.
library(e1071)
bayes.fit <- naiveBayes(NewLeague ~ ., data = train.data)
bayes.fit
# Predict the test rows, then show the confusion matrix and the accuracy.
bayes.pred <- predict(bayes.fit, test.data)
table(bayes.pred, test.data$NewLeague, dnn = c("Prediction", "Actual"))
mean(bayes.pred == test.data$NewLeague)

Naive Bayes Classifier for Discrete Predictors

Call:
naiveBayes.default(x = X, y = Y, laplace = laplace)

A-priori probabilities:
Y
       A        N 
0.535545 0.464455 

Conditional probabilities:
   AtBat
Y       [,1]     [,2]
  A 406.0885 150.5719
  N 385.7041 148.2686

   Hits
Y       [,1]     [,2]
  A 109.1504 46.85849
  N 102.1122 44.18538

   HmRun
Y        [,1]     [,2]
  A 12.592920 9.304029
  N  9.479592 7.678098

   Runs
Y       [,1]     [,2]
  A 57.00000 26.96592
  N 49.59184 23.34450

   RBI
Y       [,1]     [,2]
  A 52.77876 28.06553
  N 46.96939 23.96581

   Walks
Y       [,1]     [,2]
  A 41.07080 23.26736
  N 40.52041 19.11292

   Years
Y       [,1]     [,2]
  A 7.283186 4.676058
  N 7.428571 5.058422

   CAtBat
Y       [,1]     [,2]
  A 2637.832 2198.567
  N 2751.612 2542.893

   CHits
Y       [,1]     [,2]
  A 715.6018 619.5558
  N 747.1224 726.1904

   CHmRun
Y       [,1]     [,2]
  A 72.73451 89.36422
  N 61.76531 75.37614

   CRuns
Y       [,1]     [,2]
  A 367.2212 327.0564
  N 358.2959 357.1627

   CRBI
Y       [,1]     [,2]
  A 332.5664 332.1341
  N 330.0918 331.1957

   CWalks
Y       [,1]     [,2]
  A 258.3186 257.3786
  N 263.3673 272.2331

   League
Y            A          N
  A 0.92035398 0.07964602
  N 0.05102041 0.94897959

   Division
Y           E         W
  A 0.5221239 0.4778761
  N 0.4795918 0.5204082

   PutOuts
Y       [,1]     [,2]
  A 256.9646 248.1815
  N 303.2551 278.1822

   Assists
Y       [,1]     [,2]
  A 113.4425 139.3094
  N 123.2143 146.8728

   Errors
Y       [,1]     [,2]
  A 8.362832 6.245087
  N 8.846939 6.221599

   Salary
Y       [,1]     [,2]
  A 521.8844 464.9681
  N 537.6504 448.5814




          Actual
Prediction  A  N
         A 27  4
         N  1 20

0.903846153846154

library ( ROCR )
# Plot a ROC curve (true-positive rate against false-positive rate) for the
# scores `pred` and labels `truth`, and write the AUC, rounded to 4 decimals,
# onto the plot at (0.8, 0.1). Extra arguments in `...` go to plot().
rocplot <- function(pred, truth, ...) {
  rocr_pred <- prediction(pred, truth)
  plot(performance(rocr_pred, "tpr", "fpr"), ...)
  auc_value <- round(unlist(performance(rocr_pred, "auc")@y.values), 4)
  text(x = 0.8, y = 0.1, labels = paste("AUC =", auc_value))
}
# ROCR is used for the ROC curve. It needs the model output as probabilities,
# and different models expose their probabilities in different ways.
# Naive Bayes: type = "raw" yields one column per class; column 2 is used
# (presumably class N -- confirm against the factor levels).
bayes.pred2 <- predict(bayes.fit, test.data, type = "raw")[, 2]

par(mfrow = c(2, 2))
y <- test.data$NewLeague
rocplot(bayes.pred2, y, main = "Bayes")

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-9rfP1zgd-1680423860920)(output_82_0.png)]