# Dependencies ----
# Install only the packages that are actually missing, then attach them.
# (The original called install.packages() unconditionally on every run,
# which re-downloads packages and needs a CRAN mirror/network connection
# even when nothing changed.)
pkgs <- c("tidyverse", "caret", "psych", "car", "splines", "mgcv")
to_install <- pkgs[!vapply(pkgs, requireNamespace, logical(1),
                           quietly = TRUE)]
if (length(to_install) > 0) {
  install.packages(to_install)
}
library(tidyverse)
library(caret)
library(psych)
library(car)
library(splines)
library(mgcv)
# Data preparation ----
# Boston housing data from MASS: medv = median value of owner-occupied
# homes (response), lstat = % of the population with lower socioeconomic
# status (predictor).
data("Boston", package = "MASS")
medv <- Boston$medv
lstat <- Boston$lstat

# Spearman rank correlation: medv and lstat are strongly negatively
# correlated.
corr.test(medv, lstat, use = "complete", method = "spearman",
          adjust = "none")

# Reproducible 80/20 split into training and test sets.
set.seed(123)
training.samples <- createDataPartition(Boston$medv, p = 0.8, list = FALSE)
train.data <- Boston[training.samples, ]
test.data <- Boston[-training.samples, ]
# Exploratory plot: lstat vs medv scatter with a smoother.
# stat_smooth() picks method = "loess" (locally weighted regression)
# for a sample of this size.
ggplot(train.data, aes(x = lstat, y = medv)) +
  geom_point() +
  stat_smooth()
# Simple linear model ----
model <- lm(medv ~ lstat, data = train.data)
anova(model)    # F-test of the regression: p < 0.05 -> the line is significant
summary(model)

# Diagnostic plots in a 2x2 grid; restore the previous graphics settings
# afterwards (the original left par(mfrow = c(2, 2)) in effect, so every
# later base plot would have been squeezed into the 2x2 layout).
oldpar <- par(mfrow = c(2, 2))
plot(model)
par(oldpar)

durbinWatsonTest(model)  # residual independence: p > 0.05 -> independent
ncvTest(model)           # homoscedasticity: p > 0.05 -> constant variance

# Predict on the held-out test set and evaluate with RMSE and R^2.
predictions <- model %>% predict(test.data)
data.frame(RMSE = RMSE(predictions, test.data$medv),
           R2 = R2(predictions, test.data$medv))
# Visualize the straight-line fit on the training data.
ggplot(train.data, aes(x = lstat, y = medv)) +
  geom_point() +
  stat_smooth(method = lm, formula = y ~ x)
# Log transformation ----
# Regress medv on log(lstat) to capture the curvature seen in the
# scatter plot.
model3 <- lm(medv ~ log(lstat), data = train.data)
summary(model3)

# Predict on the test set and evaluate.
predictions3 <- predict(model3, test.data)
data.frame(RMSE = RMSE(predictions3, test.data$medv),
           R2 = R2(predictions3, test.data$medv))

# Visualize the log fit.
ggplot(train.data, aes(x = lstat, y = medv)) +
  geom_point() +
  stat_smooth(method = lm, formula = y ~ log(x))
# Polynomial regression ----
# Fit polynomials of degree 1 through 7 and compare the nested models
# with a sequential ANOVA.
fit1 <- lm(medv ~ lstat, data = train.data)
fit2 <- lm(medv ~ poly(lstat, 2, raw = TRUE), data = train.data)
fit3 <- lm(medv ~ poly(lstat, 3, raw = TRUE), data = train.data)
fit4 <- lm(medv ~ poly(lstat, 4, raw = TRUE), data = train.data)
fit5 <- lm(medv ~ poly(lstat, 5, raw = TRUE), data = train.data)
fit6 <- lm(medv ~ poly(lstat, 6, raw = TRUE), data = train.data)
fit7 <- lm(medv ~ poly(lstat, 7, raw = TRUE), data = train.data)
anova(fit1, fit2, fit3, fit4, fit5, fit6, fit7)

# Degrees 4-6 were already fitted above; reuse those fits instead of
# refitting identical models (the original fitted the degree-6 model
# three times and degrees 4 and 5 twice).
summary(fit6)
model1 <- fit6
model2 <- fit5
# NOTE(review): this reuses the name `model3`, clobbering the log-model
# from the previous section — kept for backward compatibility, but a
# distinct name would be clearer.
model3 <- fit4

# Compare the top candidates by AIC (smaller is better).
AIC(model1)
AIC(model2)
AIC(model3)

# Predict with the degree-6 model and evaluate on the test set.
predictions1 <- model1 %>% predict(test.data)
data.frame(RMSE = RMSE(predictions1, test.data$medv),
           R2 = R2(predictions1, test.data$medv))

# Visualize the degree-6 polynomial fit.
ggplot(train.data, aes(x = lstat, y = medv)) +
  geom_point() +
  stat_smooth(method = lm, formula = y ~ poly(x, 6, raw = TRUE))
# Spline regression ----
# Place knots at the lstat quartiles and fit a cubic B-spline basis.
knots <- quantile(train.data$lstat, p = c(0.25, 0.5, 0.75))
model4 <- lm(medv ~ bs(lstat, knots = knots), data = train.data)
summary(model4)

# Alternative spline with a single hand-picked knot at lstat = 20.
model41 <- lm(medv ~ bs(lstat, knots = c(20)), data = train.data)
summary(model41)

# Predict with the quartile-knot model and evaluate.
# (The original ran this predict() twice back to back; the exact
# duplicate line was removed.)
predictions4 <- model4 %>% predict(test.data)
data.frame(RMSE = RMSE(predictions4, test.data$medv),
           R2 = R2(predictions4, test.data$medv))

# NOTE(review): the smoother below uses bs(x, df = 3) (knots chosen from
# the degrees of freedom), which is NOT the same spline as model4's
# explicit quartile knots — confirm this is intentional.
ggplot(train.data, aes(x = lstat, y = medv)) +
  geom_point() +
  stat_smooth(method = lm, formula = y ~ splines::bs(x, df = 3))
# Generalized additive model ----
# Smooth term on lstat; mgcv selects the degree of smoothness
# automatically via penalized regression splines.
model5 <- gam(medv ~ s(lstat), data = train.data)
summary(model5)

# Predict on the test set and evaluate.
predictions5 <- predict(model5, test.data)
data.frame(RMSE = RMSE(predictions5, test.data$medv),
           R2 = R2(predictions5, test.data$medv))

# Visualize the GAM smooth.
ggplot(train.data, aes(x = lstat, y = medv)) +
  geom_point() +
  stat_smooth(method = gam, formula = y ~ s(x))