普通最小二乘法OLS
y=a+bx（a为截距，b为斜率） 最小化残差平方和 f(e)=sum((y-y')^2)
b=cov(x,y)/var(x) #证明省略 协方差/方差
a=mean(y)-b*mean(x)
Pearson相关系数
R=cor(x,y)
多元线性回归
Y=XB+E
Y、B、E为向量，X为带偏置（截距）列的矩阵--用最小二乘法求系数
B=solve((t(x)%*%x))%*%t(x)%*%y
估计B的函数
# Estimate multiple linear regression coefficients by ordinary least squares.
#
# Args:
#   y: response vector with one value per observation.
#   x: predictor vector, matrix, or data frame (one row per observation);
#      it is coerced with as.matrix() and an intercept column is prepended.
#
# Returns:
#   A one-column matrix named "estimate" with one row per coefficient
#   ("Intercept" first). It is printed as a side effect and returned
#   invisibly (the value of print()).
reg <- function(y, x) {
  x <- as.matrix(x)
  # Non-numeric columns (e.g. factors) turn the whole matrix into character,
  # which would otherwise fail later with an opaque matrix-product error.
  if (is.character(x)) {
    stop("`x` must contain only numeric columns", call. = FALSE)
  }
  if (NROW(y) != nrow(x)) {
    stop("`y` and `x` must have the same number of rows", call. = FALSE)
  }
  # Prepend a column named "Intercept" that is 1 in every row.
  x <- cbind(Intercept = 1, x)
  # Solve the normal equations (X'X) b = X'y directly instead of forming the
  # explicit inverse of X'X and multiplying, which is slower and less accurate.
  b <- solve(crossprod(x), crossprod(x, y))
  colnames(b) <- "estimate"
  print(b)
}
reg(y = launch$distress_ct, x = launch[2:4])
model <- lm(distress_ct ~ temperature + field_check_pressure + flight_num , data = launch)
对比结果一致
利用直方图看分布
hist(insurance$expenses)
利用table统计出现个数
table(insurance$region)
改进的散点图矩阵--数值是相关系数-椭圆越拉伸相关性越强-红色的线为拟合曲线
library(psych)
pairs.panels(insurance[c("age", "bmi", "children", "expenses")])
线性回归模型lm()默认stats包里
复相关系数R--皮尔逊相关
决定系数R方-衡量模型解释因变量取值的程度-越接近1越好=复相关系数的平方
R方=cor(pre,real)^2
提升模型的性能
1添加非线性关系x^2
2转换数值型转换为二进制指标ifelse函数
insurance$bmi30 <- ifelse(insurance$bmi >= 30, 1, 0)
3加入交互作用的影响：bmi30*smoker 等价于 bmi30 + smoker + bmi30:smoker（主效应加交互项）；bmi30:smoker 只包含交互项本身
lm(expenses ~ age + bmi30*smoker, data = insurance)
lm(expenses ~ age + bmi30:smoker, data = insurance)
4全部放在一起
ins_model2 <- lm(expenses ~ age + age2 + children + bmi + sex +
y=a+bx（a为截距，b为斜率） 最小化残差平方和 f(e)=sum((y-y')^2)
b=cov(x,y)/var(x) #证明省略 协方差/方差
a=mean(y)-b*mean(x)
Pearson相关系数
R=cor(x,y)
多元线性回归
Y=XB+E
Y、B、E为向量，X为带偏置（截距）列的矩阵--用最小二乘法求系数
B=solve((t(x)%*%x))%*%t(x)%*%y
估计B的函数
# Estimate multiple linear regression coefficients by ordinary least squares.
#
# Args:
#   y: response vector with one value per observation.
#   x: predictor vector, matrix, or data frame (one row per observation);
#      it is coerced with as.matrix() and an intercept column is prepended.
#
# Returns:
#   A one-column matrix named "estimate" with one row per coefficient
#   ("Intercept" first). It is printed as a side effect and returned
#   invisibly (the value of print()).
reg <- function(y, x) {
  x <- as.matrix(x)
  # Non-numeric columns (e.g. factors) turn the whole matrix into character,
  # which would otherwise fail later with an opaque matrix-product error.
  if (is.character(x)) {
    stop("`x` must contain only numeric columns", call. = FALSE)
  }
  if (NROW(y) != nrow(x)) {
    stop("`y` and `x` must have the same number of rows", call. = FALSE)
  }
  # Prepend a column named "Intercept" that is 1 in every row.
  x <- cbind(Intercept = 1, x)
  # Solve the normal equations (X'X) b = X'y directly instead of forming the
  # explicit inverse of X'X and multiplying, which is slower and less accurate.
  b <- solve(crossprod(x), crossprod(x, y))
  colnames(b) <- "estimate"
  print(b)
}
reg(y = launch$distress_ct, x = launch[2:4])
model <- lm(distress_ct ~ temperature + field_check_pressure + flight_num , data = launch)
对比结果一致
利用直方图看分布
hist(insurance$expenses)
利用table统计出现个数
table(insurance$region)
改进的散点图矩阵--数值是相关系数-椭圆越拉伸相关性越强-红色的线为拟合曲线
library(psych)
pairs.panels(insurance[c("age", "bmi", "children", "expenses")])
线性回归模型lm()默认stats包里
复相关系数R--皮尔逊相关
决定系数R方-衡量模型解释因变量取值的程度-越接近1越好=复相关系数的平方
R方=cor(pre,real)^2
提升模型的性能
1添加非线性关系x^2
2转换数值型转换为二进制指标ifelse函数
insurance$bmi30 <- ifelse(insurance$bmi >= 30, 1, 0)
3加入交互作用的影响：bmi30*smoker 等价于 bmi30 + smoker + bmi30:smoker（主效应加交互项）；bmi30:smoker 只包含交互项本身
lm(expenses ~ age + bmi30*smoker, data = insurance)
lm(expenses ~ age + bmi30:smoker, data = insurance)
4全部放在一起
ins_model2 <- lm(expenses ~ age + age2 + children + bmi + sex +
bmi30*smoker + region, data = insurance)
# Load the launch data used for the hand-computed simple regression.
launch <- read.csv("challenger.csv")

# slope (beta): covariance of predictor and response over predictor variance
b <- with(launch, cov(temperature, distress_ct) / var(temperature))
b

# intercept (alpha): mean response minus slope times mean predictor
a <- mean(launch[["distress_ct"]]) - b * mean(launch[["temperature"]])
a

# correlation computed by hand: covariance scaled by both standard deviations
r <- with(
  launch,
  cov(temperature, distress_ct) / (sd(temperature) * sd(distress_ct))
)
r

# the built-in cor() should print the same value as r
with(launch, cor(temperature, distress_ct))

# the slope again, this time derived from the correlation
r * with(launch, sd(distress_ct) / sd(temperature))

# cross-check the hand-computed line against lm() (not in text)
model <- lm(distress_ct ~ temperature, data = launch)
model
summary(model)
# Estimate multiple linear regression coefficients by ordinary least squares.
#
# Args:
#   y: response vector with one value per observation.
#   x: predictor vector, matrix, or data frame (one row per observation);
#      it is coerced with as.matrix() and an intercept column is prepended.
#
# Returns:
#   A one-column matrix named "estimate" with one row per coefficient
#   ("Intercept" first). It is printed as a side effect and returned
#   invisibly (the value of print()).
reg <- function(y, x) {
  x <- as.matrix(x)
  # Non-numeric columns (e.g. factors) turn the whole matrix into character,
  # which would otherwise fail later with an opaque matrix-product error.
  if (is.character(x)) {
    stop("`x` must contain only numeric columns", call. = FALSE)
  }
  if (NROW(y) != nrow(x)) {
    stop("`y` and `x` must have the same number of rows", call. = FALSE)
  }
  # Prepend a column named "Intercept" that is 1 in every row.
  x <- cbind(Intercept = 1, x)
  # Solve the normal equations (X'X) b = X'y directly instead of forming the
  # explicit inverse of X'X and multiplying, which is slower and less accurate.
  b <- solve(crossprod(x), crossprod(x, y))
  colnames(b) <- "estimate"
  print(b)
}
# inspect the structure of the launch data
str(launch)

# single-predictor check: column 2 only, kept as a one-column data frame
reg(y = launch[["distress_ct"]], x = launch[, 2, drop = FALSE])

# multiple-predictor run: columns 2 through 4 as predictors
reg(y = launch[["distress_ct"]], x = launch[, 2:4])

# cross-check the multiple regression estimates with lm() (not in text)
# NOTE(review): the commented-out call below uses different column names
# (pressure, launch_id) - presumably from another version of the data; confirm.
#model <- lm(distress_ct ~ temperature + pressure + launch_id, data = launch)
model <- lm(
  distress_ct ~ temperature + field_check_pressure + flight_num,
  data = launch
)
model
## Example: Predicting Medical Expenses ----
## Step 2: Exploring and preparing the data ----
# Load the insurance data; stringsAsFactors = TRUE asks read.csv to store
# text columns as factors.
insurance <- read.csv("insurance.csv", stringsAsFactors = TRUE)
# show the column names, types and first values
str(insurance)
# summarize the expenses variable (the response modelled later in the script)
summary(insurance$expenses)
# histogram of insurance expenses, to look at the shape of the distribution
hist(insurance$expenses)
# table of region: count how many rows fall in each region
table(insurance$region)
# exploring relationships among features: correlation matrix
cor(insurance[c("age", "bmi", "children", "expenses")])
# visualizing relationships among features: pairwise scatterplot matrix
pairs(insurance[c("age", "bmi", "children", "expenses")])
# more informative scatterplot matrix via psych::pairs.panels
# NOTE(review): the notes at the top of this file describe the panels as
# showing correlation coefficients, correlation ellipses (more stretched =
# stronger correlation) and a red fitted curve - confirm against psych docs.
library(psych)
pairs.panels(insurance[c("age", "bmi", "children", "expenses")])
## Step 3: Training a model on the data ----
# Fit expenses on six explicitly named predictors.
ins_model <- lm(expenses ~ age + children + bmi + sex + smoker + region,
data = insurance)
# Refit with the `.` shorthand (all remaining columns of the data as
# predictors); this overwrites the object created above.
# NOTE(review): the two fits match only while insurance holds no columns other
# than expenses and the six predictors named above - confirm against the CSV.
ins_model <- lm(expenses ~ ., data = insurance) # this is equivalent to above
# see the estimated beta coefficients
ins_model
## Step 4: Evaluating model performance ----
# see more detail about the estimated beta coefficients
# (summary() adds standard errors, significance tests and R-squared)
summary(ins_model)
## Step 5: Improving model performance ----
# squared age term, so the model can pick up a non-linear age effect
insurance[["age2"]] <- insurance[["age"]]^2

# 0/1 indicator marking rows whose BMI is 30 or more
insurance[["bmi30"]] <- ifelse(insurance[["bmi"]] >= 30, 1, 0)

# final model: adds age2 and the bmi30-by-smoker interaction; `*` expands to
# both main effects plus their interaction term
ins_model2 <- lm(
  expenses ~ age + age2 + children + bmi + sex + bmi30*smoker + region,
  data = insurance
)

summary(ins_model2)