【R语言】线性回归模型

Vickie_wwq

于 2023-06-30 16:17:39 发布

阅读量116

点赞数

分类专栏： R语言自学笔记文章标签： r语言线性回归开发语言笔记

本文链接：https://blog.csdn.net/weixin_46155316/article/details/131446775

版权

R语言自学笔记专栏收录该内容

12 篇文章 3 订阅

订阅专栏

#数据集
install.packages("DAAG")
library(DAAG)
head(ais,n=3)

#载入包
library(e1071)
library(plyr)
library(ggplot2)

ais2 <- subset(ais,sex=="m")#只考虑男性
ais3 <- ais2[,c(3,4)]#仅关注血液特征
newdata <- rename(ais3,c("hg"="HEMAGLOBIN","hc"="HEMATOCRIT"))
str(newdata)

#检验数据集中是否存在缺失值
colSums(is.na(newdata))

summary(newdata)

图形分析

#图形分析
#箱线图
par(mfrow=c(1,2))
boxplot(newdata$HEMAGLOBIN,col = "yellow",border = "blue",main="HEMAGLOBIN boxplot",ylab="g per decaliter")
boxplot(newdata$HEMATOCRIT,col = "orange",border = "blue",main="HEMATOCRIT boxplot",ylab="percent values")
#异常值检测
boxplot.stats(newdata$HEMAGLOBIN)$out
# [1] 18.0 19.2 18.5 17.7
boxplot.stats(newdata$HEMATOCRIT)$out
# [1] 40.3 59.7

在这里插入图片描述

#直方图
library(ggplot2)
qplot(HEMAGLOBIN, data = newdata, geom="histogram", binwidth=0.5, 
      fill=I("azure4"), col=I("azure3")) +
  labs(title = "HEMAGLOBIN") +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(x ="Concentration (in g per decaliter)") +
  labs(y = "Frequency") +
  scale_y_continuous(breaks = c(0,5,10,15,20,25,30,35,40,45,50), minor_breaks = NULL) +
  scale_x_continuous(breaks = c(10:25), minor_breaks = NULL) +
  geom_vline(xintercept = mean(newdata$HEMAGLOBIN), show_guide=TRUE, color
             ="red", labels="Average") +
  geom_vline(xintercept = median(newdata$HEMAGLOBIN), show_guide=TRUE, color
             ="blue", labels="Median")

qplot(HEMATOCRIT, data = newdata, geom="histogram", binwidth=1, 
      fill=I("azure4"), col=I("azure3")) +
  labs(title = "HEMATOCRIT") +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(x ="percent values") +
  labs(y = "Frequency") +
  scale_y_continuous(breaks = c(0,5,10,15,20,25), minor_breaks = NULL) +
  scale_x_continuous(breaks = c(30:65), minor_breaks = NULL) +
  geom_vline(xintercept = mean(newdata$HEMATOCRIT), show_guide=TRUE, color
             ="red", labels="Average") +
  geom_vline(xintercept = median(newdata$HEMATOCRIT), show_guide=TRUE, color
             ="blue", labels="Median")

在这里插入图片描述

#密度图
par(mfrow=c(1, 2))

plot(density(newdata$HEMAGLOBIN), main="density: HEMAGLOBIN", ylab="Frequency", 
     sub=paste("Skewness:", round(e1071::skewness(newdata$HEMAGLOBIN), 2)))
polygon(density(newdata$HEMAGLOBIN), col="yellow")

plot(density(newdata$HEMATOCRIT), main="density: HEMATOCRIT", ylab="Frequency", 
     sub=paste("Skewness:", round(e1071::skewness(newdata$HEMATOCRIT), 2)))
polygon(density(newdata$HEMATOCRIT), col="orange")

在这里插入图片描述

#散点图
qplot(HEMAGLOBIN, HEMATOCRIT, data = newdata,
      main = "HEMAGLOBIN and HEMATOCRIT relationship") +
  theme(plot.title = element_text(hjust = 0.5)) +
  geom_point(colour = "blue", size = 1.5) +
  scale_y_continuous(breaks = c(30:65), minor_breaks = NULL) +
  scale_x_continuous(breaks = c(10:25), minor_breaks = NULL)

在这里插入图片描述

建立线性模型

#建立线性模型
qplot(HEMAGLOBIN, HEMATOCRIT, data = newdata,
      main = "HEMAGLOBIN and HEMATOCRIT relationship") +
  theme(plot.title = element_text(hjust = 0.5)) +
  stat_smooth(method="lm", col="red", size=1) +
  geom_point(colour = "blue", size = 1.5) +
  scale_y_continuous(breaks = c(30:65), minor_breaks = NULL) +
  scale_x_continuous(breaks = c(10:25), minor_breaks = NULL)

set.seed(123) 
HEMAGLOBIN_CENT = scale(newdata$HEMAGLOBIN, center=TRUE, scale=FALSE)

qplot(HEMAGLOBIN_CENT, HEMATOCRIT, data = newdata,
      main = "HEMAGLOBIN_CENT and HEMATOCRIT relationship") +
  theme(plot.title = element_text(hjust = 0.5)) +
  stat_smooth(method="lm", col="red", size=1) +
  geom_point(colour = "blue", size = 1.5) +
  scale_y_continuous(breaks = c(30:65), minor_breaks = NULL) +
  scale_x_continuous(breaks = c(-2,-1.5,-1,-0.5,0,0.5,1,1.5,2,2.5,3,3.5,4), minor_breaks = NULL)

mod1 = lm(HEMATOCRIT ~ HEMAGLOBIN_CENT, data = newdata)
summary(mod1)
# Call:
#   lm(formula = HEMATOCRIT ~ HEMAGLOBIN_CENT, data = newdata)
# 
# Residuals:
#   Min      1Q  Median      3Q     Max
# -3.4183 -0.7043 -0.0072  0.6049  5.0765
# 
# Coefficients:
#   Estimate Std. Error t value Pr(>|t|)
# (Intercept)      45.6500     0.1140  400.35   <2e-16 ***
#   HEMAGLOBIN_CENT   2.4605     0.1227   20.06   <2e-16 ***
#   ---
#   Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# 
# Residual standard error: 1.152 on 100 degrees of freedom
# Multiple R-squared:  0.801,	Adjusted R-squared:  0.799
# F-statistic: 402.4 on 1 and 100 DF,  p-value: < 2.2e-16

在这里插入图片描述

回归模型的图形诊断

#残差图
par(mfrow=c(1, 1))
plot(mod1, pch=16, col="blue", lty=1, lwd=2, which=1)

在这里插入图片描述

# QQ图
#用于查看数据是否满足正态分布假设
plot(mod1, pch=16, col="blue", lty=1, lwd=2, which=2)

在这里插入图片描述

#比例位置图
#用于显示标准化残差与预测值之间的关系
plot(mod1, pch=16, col="blue", lty=1, lwd=2, which=3)

在这里插入图片描述

#Residuals Leverage图
#用于度量数据的重要性
plot(mod1, pch=16, col="blue", lty=1, lwd=2, which=5)
#远离虚线的每个点都可以作为一个有影响力的点
#库克距离用于度量每个观察数据对回归系数的影响
plot(mod1, pch=16, col="blue", lty=1, lwd=2, which=4)
#库克距离大于1表明存在可能的异常值

在这里插入图片描述

预测模型

#预测模型
set.seed(123)
trainingRowIndex <- sample(1:nrow(newdata),0.7*nrow(newdata))
trainingData <- newdata[trainingRowIndex,]#训练集
testData <- newdata[-trainingRowIndex,]#测试集
#在训练数据上开发模型，并用其预测测试数据上的HEMATOCRIT
modTrain <- lm(HEMATOCRIT~HEMAGLOBIN,data = trainingData)
predict <- predict(modTrain,testData)
summary(modTrain)

act_pred <- data.frame(cbind(actuals=testData$HEMATOCRIT,predicteds=predict))
cor(act_pred)
# actuals predicteds
# actuals    1.0000000  0.8869261
# predicteds 0.8869261  1.0000000
head(act_pred,10)
#     actuals predicteds
# 101    46.8   46.45078
# 102    45.2   44.72653
# 103    46.6   46.45078
# 110    44.9   45.21918
# 111    47.8   46.94342
# 119    44.5   45.21918
# 124    46.5   45.46550
# 128    45.5   45.71182
# 135    40.7   41.03172
# 137    40.3   40.53908

#比较小的值与比较大的值 的商 的平均值 是一个很好的度量标准
min_max <- mean(apply(act_pred,1,min)/apply(act_pred,1,max))
print(min_max)
#[1] 0.9833499

#平均绝对百分比偏差
mape <- mean(abs((act_pred$predicteds-act_pred$actuals))/act_pred$actuals)
print(mape)
#[1] 0.01679113

抽样方法

k折叠交叉验证方法
将一组观察值随机地划分为大致相等大小的k个随机样本
首先每个部分保持为测试数据，将模型重新拟合用于预测删除的观察结果的剩余k-1部分。
然后计算均方误差来估计测试误差。

自己理解的
在这里插入图片描述

#抽样方法
#k折叠交叉验证方法
library(DAAG)
kflod <- CVlm(data = newdata,form.lm = formula(HEMATOCRIT~HEMAGLOBIN),m=5,
              dots = FALSE,seed = 123,legend.pos = "topleft",
              main="Cross Validation;k=5",
              plotit = TRUE,printit = FALSE)
attr(kflod,'ms')
# [1] 1.515335
#均方误差越小，拟合线越接近最佳拟合线