# 1. 多元线性回归
# Keep only the five variables used in this analysis from the built-in
# state.x77 matrix and convert the result to a data frame for lm().
states <- as.data.frame(
  state.x77[, c("Murder", "Population", "Illiteracy", "Income", "Frost")]
)

# Multiple linear regression of Murder on the four remaining variables.
fit <- lm(Murder ~ Population + Illiteracy + Income + Frost, data = states)
summary(fit)

coef(fit)       # estimated regression coefficients
confint(fit)    # confidence intervals for the coefficients (95% by default)
fitted(fit)     # fitted values of the model
residuals(fit)  # raw residuals of the model
rstandard(fit)  # standardized residuals
rstudent(fit)   # studentized residuals (SRE_i)
AIC(fit)        # Akaike information criterion
# 2. 基本假设检验
# car provides the diagnostic tests used below; ggpubr provides ggqqplot().
library(car)
library(ggpubr)

# Normality check: Q-Q plot of the studentized residuals.
res <- rstudent(fit)
ggqqplot(res, color = "#E7B800")

# Independence check (Durbin-Watson): p > 0.05 means no evidence of
# autocorrelation among the residuals.
durbinWatsonTest(fit)

# Constant-variance check: p > 0.05 is consistent with homoscedasticity.
ncvTest(fit)
# 3. 异常值观测
# Outliers: a non-significant result suggests the data contain no outliers.
# The original note says detected outliers are usually removed.
outlierTest(fit)

# High-leverage points: hat values of at least 2x (or, more strictly, 3x)
# the mean hat value (p + 1) / n, where p is the number of predictors.
# length(coef(fit)) is p + 1 (slopes plus intercept), so the cutoff follows
# the fitted model instead of a hard-coded predictor count.
hii <- hatvalues(fit)
high_leverage_point <- hii[hii >= 2 * length(coef(fit)) / length(hii)]

# Influential points: Cook's distance D_i > 1, or the stricter rule of thumb
# D_i > 4 / (n - p - 1). The original note recommends removing such points.
Di <- cooks.distance(fit)
Di[Di > 1]
# df.residual(fit) equals n - p - 1 for a model with an intercept.
Di[Di > 4 / df.residual(fit)]
# 4. 多重共线性
# Variance inflation factors. Rule of thumb from the original note:
# VIF > 4 indicates multicollinearity, VIF > 10 indicates severe
# multicollinearity.
vif(fit)
# 5. 变量选择,逐步回归
# MASS provides stepAIC() for AIC-based stepwise selection.
library(MASS)

# AIC of the full model, shown as the baseline before selection.
AIC(fit)

# Stepwise search that may both drop and re-add terms at each step.
stepAIC(fit, direction = "both")
# 6. 确定最终函数
# Drop the Nevada observation before refitting. A logical mask is used
# instead of states[-grep(...), ]: when grep() finds no match, negating its
# empty result selects zero rows and silently discards the whole data set.
states <- states[!grepl("Nevada", rownames(states)), ]

# Final model: Murder explained by Population and Illiteracy only.
final_fit <- lm(Murder ~ Population + Illiteracy, data = states)
summary(final_fit)
# 7. 在新的数据集上预测
# New observations to predict. Columns must be in the same order as the
# predictors of final_fit, because myfunction() matches them by position.
df <- data.frame(
  Population = c(360, 2200),
  Illiteracy = c(1.4, 1.9)
)

# Predict the response for one observation from the final model.
#
# x1: numeric vector of predictor values, ordered like the slope
#     coefficients of final_fit (i.e. coef(final_fit)[-1]).
# Returns intercept + sum(slope * x1) as a length-one numeric.
#
# Both the slopes and the intercept come from final_fit. The earlier version
# took the slopes from the full model `fit`, mixing two different models.
myfunction <- function(x1) {
  beta <- coef(final_fit)
  sum(beta[-1] * x1) + beta[1]
}

# Predict each row of the new data set; predict(final_fit, newdata = df)
# is the built-in equivalent.
apply(df, 1, myfunction)