R语言上课代码记录9

# Linear Regression
# Load the plotting library and the built-in mtcars data set
library(ggplot2)

data("mtcars")
# Scatter plot of car weight (wt, 1000 lbs) against fuel efficiency (mpg)
ggplot(mtcars) +
  geom_point(aes(x = wt, y = mpg)) +
  theme_bw()

# Linear model: mpg = a*wt + b + error
# With a = -5, b = 40 the model becomes: mpg = -5*wt + 40 + error
# Fix: the original chain was missing the `+` before theme_bw(), so the
# theme was evaluated as a separate expression and never applied to the plot.
ggplot(data = mtcars) +
  geom_point(aes(x = wt, y = mpg)) +
  geom_abline(intercept = 40, slope = -5, col = "red") +
  theme_bw()

# How good is the model predicted_mpg = -5*wt + 40?
# Draw each observation, its fitted value (red point on the line), and a
# vertical segment for the prediction error (residual).
ggplot(mtcars) +
  geom_point(aes(x = wt, y = mpg)) +
  geom_abline(intercept = 40, slope = -5, col = "red") +
  geom_point(aes(x = wt, y = -5 * wt + 40), col = "red") +
  geom_segment(aes(x = wt, xend = wt, y = mpg, yend = -5 * wt + 40), alpha = 0.1) +
  theme_bw()

# Compare with a second candidate model: predicted_mpg = -4*wt + 33
# (red = first guess, blue = second guess, fitted values as colored points)
ggplot(mtcars) +
  geom_point(aes(x = wt, y = mpg)) +
  geom_abline(intercept = 40, slope = -5, col = "red") +
  geom_point(aes(x = wt, y = -5 * wt + 40), col = "red") +
  geom_abline(intercept = 33, slope = -4, col = "blue") +
  geom_point(aes(x = wt, y = -4 * wt + 33), col = "blue") +
  theme_bw()

# How to find the best linear model?
# We have samples(observations):
# (wt1, mpg1), (wt2, mpg2),......, (wt32, mpg32)
# The linear model has the form mpg = a*wt + b + error
# Given wt, my linear model will have a predicted mpg:
#       predicted_mpg(i) = a*wt(i) + b, i = 1,2,...,32
# The difference between mpg and predicted_mpg is the prediction error:
#       mpg(i) = predicted_mpg(i) + error(i)
# which is equivalent to
#       error(i) = mpg(i) - predicted_mpg(i)
# From the graph, we find some errors are positive, some are negative
# To make all errors positive, we square them
#       error(i)^2 = (mpg(i) - predicted_mpg(i))^2
# The total squared error is
# error(1)^2 + error(2)^2 +...+ error(32)^2
# which is also:
#       sum_{i=1 to 32} (mpg(i) - predicted_mpg(i))^2
# which is also:
#       sum_{i=1 to 32} (mpg(i) - (a*wt(i) + b))^2
# In our example, the above is 
# (21-a*2.62-b)^2 + (21-a*2.875-b)^2 + ... + (21.4-a*2.78-b)^2
# ---------------   ---------------          -----------------
#    error(1)^2   +    error(2)^2    + ... +     error(32)^2

# Let f(a,b) = sum_{i=1 to 32} (mpg(i) - (a*wt(i) + b))^2
# We want to find a* and b* that minimize f(a,b)
# First order condition (FOC):
#       df(a,b)/da = 0  and  df(a,b)/db = 0
# Solving this system of equations gives:
# a*=(Sum_{i=1 to 32} (wt(i) - wt_avg)*(mpg(i) - mpg_avg))/(Sum_{i=1 to 32} (wt(i) - wt_avg)^2)
# b*= mpg_avg - (a*)*wt_avg

# Fit the regression mpg ~ wt by ordinary least squares
mymodel <- lm(mpg ~ wt, data = mtcars)
# Coefficient estimates, standard errors, R-squared, etc.
summary(mymodel)
# Confidence intervals for the fitted coefficients
confint(mymodel)

# Overlay the two guessed models (red, blue) and the OLS fit (orange,
# intercept 37.28 and slope -5.34 rounded from the lm() output)
ggplot(mtcars) +
  geom_point(aes(x = wt, y = mpg)) +
  geom_abline(intercept = 40, slope = -5, col = "red") +
  geom_point(aes(x = wt, y = -5 * wt + 40), col = "red") +
  geom_abline(intercept = 33, slope = -4, col = "blue") +
  geom_point(aes(x = wt, y = -4 * wt + 33), col = "blue") +
  geom_abline(intercept = 37.28, slope = -5.34, col = "orange") +
  geom_point(aes(x = wt, y = -5.34 * wt + 37.28), col = "orange") +
  theme_bw()

# Two aspects of a linear model:
# Prediction (predicted_y) in statistics
# Named color vector drives the legend; the label "预测predict" ("predicted")
# must match the color= strings used in the aes() calls below.
cols <- c("预测predict" = "orange", "actual" = "black")
ggplot(mtcars) +
  geom_point(aes(x = wt, y = mpg, color = "actual"), shape = 18, size = 3) +
  geom_abline(aes(intercept = 37.28, slope = -5.34, color = "预测predict"), size = 0.5) +
  geom_point(aes(x = wt, y = -5.34 * wt + 37.28, color = "预测predict"), shape = 18, size = 4) +
  scale_color_manual(name = "", values = cols) +
  theme_bw()

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值