#数据集
install.packages("DAAG")
library(DAAG)
head(ais,n=3)
#载入包
library(e1071)
library(plyr)
library(ggplot2)
ais2 <- subset(ais,sex=="m")#只考虑男性
ais3 <- ais2[,c(3,4)]#仅关注血液特征
newdata <- rename(ais3,c("hg"="HEMAGLOBIN","hc"="HEMATOCRIT"))
str(newdata)
#检验数据集中是否存在缺失值
colSums(is.na(newdata))
summary(newdata)
图形分析
#图形分析
#箱线图
par(mfrow=c(1,2))
boxplot(newdata$HEMAGLOBIN,col = "yellow",border = "blue",main="HEMAGLOBIN boxplot",ylab="g per decaliter")
boxplot(newdata$HEMATOCRIT,col = "orange",border = "blue",main="HEMATOCRIT boxplot",ylab="percent values")
#异常值检测
boxplot.stats(newdata$HEMAGLOBIN)$out
# [1] 18.0 19.2 18.5 17.7
boxplot.stats(newdata$HEMATOCRIT)$out
# [1] 40.3 59.7
#直方图
library(ggplot2)
qplot(HEMAGLOBIN, data = newdata, geom="histogram", binwidth=0.5,
fill=I("azure4"), col=I("azure3")) +
labs(title = "HEMAGLOBIN") +
theme(plot.title = element_text(hjust = 0.5)) +
labs(x ="Concentration (in g per decaliter)") +
labs(y = "Frequency") +
scale_y_continuous(breaks = c(0,5,10,15,20,25,30,35,40,45,50), minor_breaks = NULL) +
scale_x_continuous(breaks = c(10:25), minor_breaks = NULL) +
geom_vline(xintercept = mean(newdata$HEMAGLOBIN), show_guide=TRUE, color
="red", labels="Average") +
geom_vline(xintercept = median(newdata$HEMAGLOBIN), show_guide=TRUE, color
="blue", labels="Median")
qplot(HEMATOCRIT, data = newdata, geom="histogram", binwidth=1,
fill=I("azure4"), col=I("azure3")) +
labs(title = "HEMATOCRIT") +
theme(plot.title = element_text(hjust = 0.5)) +
labs(x ="percent values") +
labs(y = "Frequency") +
scale_y_continuous(breaks = c(0,5,10,15,20,25), minor_breaks = NULL) +
scale_x_continuous(breaks = c(30:65), minor_breaks = NULL) +
geom_vline(xintercept = mean(newdata$HEMATOCRIT), show_guide=TRUE, color
="red", labels="Average") +
geom_vline(xintercept = median(newdata$HEMATOCRIT), show_guide=TRUE, color
="blue", labels="Median")
#密度图
par(mfrow=c(1, 2))
plot(density(newdata$HEMAGLOBIN), main="density: HEMAGLOBIN", ylab="Frequency",
sub=paste("Skewness:", round(e1071::skewness(newdata$HEMAGLOBIN), 2)))
polygon(density(newdata$HEMAGLOBIN), col="yellow")
plot(density(newdata$HEMATOCRIT), main="density: HEMATOCRIT", ylab="Frequency",
sub=paste("Skewness:", round(e1071::skewness(newdata$HEMATOCRIT), 2)))
polygon(density(newdata$HEMATOCRIT), col="orange")
#散点图
qplot(HEMAGLOBIN, HEMATOCRIT, data = newdata,
main = "HEMAGLOBIN and HEMATOCRIT relationship") +
theme(plot.title = element_text(hjust = 0.5)) +
geom_point(colour = "blue", size = 1.5) +
scale_y_continuous(breaks = c(30:65), minor_breaks = NULL) +
scale_x_continuous(breaks = c(10:25), minor_breaks = NULL)
建立线性模型
#建立线性模型
qplot(HEMAGLOBIN, HEMATOCRIT, data = newdata,
main = "HEMAGLOBIN and HEMATOCRIT relationship") +
theme(plot.title = element_text(hjust = 0.5)) +
stat_smooth(method="lm", col="red", size=1) +
geom_point(colour = "blue", size = 1.5) +
scale_y_continuous(breaks = c(30:65), minor_breaks = NULL) +
scale_x_continuous(breaks = c(10:25), minor_breaks = NULL)
set.seed(123)
HEMAGLOBIN_CENT = scale(newdata$HEMAGLOBIN, center=TRUE, scale=FALSE)
qplot(HEMAGLOBIN_CENT, HEMATOCRIT, data = newdata,
main = "HEMAGLOBIN_CENT and HEMATOCRIT relationship") +
theme(plot.title = element_text(hjust = 0.5)) +
stat_smooth(method="lm", col="red", size=1) +
geom_point(colour = "blue", size = 1.5) +
scale_y_continuous(breaks = c(30:65), minor_breaks = NULL) +
scale_x_continuous(breaks = c(-2,-1.5,-1,-0.5,0,0.5,1,1.5,2,2.5,3,3.5,4), minor_breaks = NULL)
mod1 = lm(HEMATOCRIT ~ HEMAGLOBIN_CENT, data = newdata)
summary(mod1)
# Call:
# lm(formula = HEMATOCRIT ~ HEMAGLOBIN_CENT, data = newdata)
#
# Residuals:
# Min 1Q Median 3Q Max
# -3.4183 -0.7043 -0.0072 0.6049 5.0765
#
# Coefficients:
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 45.6500 0.1140 400.35 <2e-16 ***
# HEMAGLOBIN_CENT 2.4605 0.1227 20.06 <2e-16 ***
# ---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
# Residual standard error: 1.152 on 100 degrees of freedom
# Multiple R-squared: 0.801, Adjusted R-squared: 0.799
# F-statistic: 402.4 on 1 and 100 DF, p-value: < 2.2e-16
回归模型的图形诊断
#残差图
par(mfrow=c(1, 1))
plot(mod1, pch=16, col="blue", lty=1, lwd=2, which=1)
# QQ图
#用于查看数据是否满足正态分布假设
plot(mod1, pch=16, col="blue", lty=1, lwd=2, which=2)
#比例位置图
#用于显示标准化残差与预测值之间的关系
plot(mod1, pch=16, col="blue", lty=1, lwd=2, which=3)
#Residuals Leverage图
#用于度量数据的重要性
plot(mod1, pch=16, col="blue", lty=1, lwd=2, which=5)
#远离虚线的每个点都可以作为一个有影响力的点
#库克距离用于度量每个观察数据对回归系数的影响
plot(mod1, pch=16, col="blue", lty=1, lwd=2, which=4)
#库克距离大于1表明存在可能的异常值
预测模型
#预测模型
set.seed(123)
trainingRowIndex <- sample(1:nrow(newdata),0.7*nrow(newdata))
trainingData <- newdata[trainingRowIndex,]#训练集
testData <- newdata[-trainingRowIndex,]#测试集
#在训练数据上开发模型,并用其预测测试数据上的HEMATOCRIT
modTrain <- lm(HEMATOCRIT~HEMAGLOBIN,data = trainingData)
predict <- predict(modTrain,testData)
summary(modTrain)
act_pred <- data.frame(cbind(actuals=testData$HEMATOCRIT,predicteds=predict))
cor(act_pred)
# actuals predicteds
# actuals 1.0000000 0.8869261
# predicteds 0.8869261 1.0000000
head(act_pred,10)
# actuals predicteds
# 101 46.8 46.45078
# 102 45.2 44.72653
# 103 46.6 46.45078
# 110 44.9 45.21918
# 111 47.8 46.94342
# 119 44.5 45.21918
# 124 46.5 45.46550
# 128 45.5 45.71182
# 135 40.7 41.03172
# 137 40.3 40.53908
#比较小的值与比较大的值 的商 的平均值 是一个很好的度量标准
min_max <- mean(apply(act_pred,1,min)/apply(act_pred,1,max))
print(min_max)
#[1] 0.9833499
#平均绝对百分比偏差
mape <- mean(abs((act_pred$predicteds-act_pred$actuals))/act_pred$actuals)
print(mape)
#[1] 0.01679113
抽样方法
k折叠交叉验证方法
将一组观察值随机地划分为大致相等大小的k个随机样本
首先每个部分保持为测试数据,将模型重新拟合用于预测删除的观察结果的剩余k-1部分。
然后计算均方误差来估计测试误差。
自己理解的
#抽样方法
#k折叠交叉验证方法
library(DAAG)
kflod <- CVlm(data = newdata,form.lm = formula(HEMATOCRIT~HEMAGLOBIN),m=5,
dots = FALSE,seed = 123,legend.pos = "topleft",
main="Cross Validation;k=5",
plotit = TRUE,printit = FALSE)
attr(kflod,'ms')
# [1] 1.515335
#均方误差越小,拟合线越接近最佳拟合线