R数据分析探索双变量

install.packages('dplyr')

使用平均值

一般使用散点图研究两个变量之间的关系

# ggplot() and qplot() come from ggplot2, so the package has to be attached
# before the first plot is drawn.
library(ggplot2)

# Read the tab-separated pseudo-Facebook data set from the working directory.
pf <- read.delim("pseudo_facebook.tsv")

# Scatter plot of friend_count against age.
ggplot(aes(x = age, y = friend_count), data = pf) +
  geom_point()

# The same scatter plot written with the qplot() shorthand.
qplot(age, friend_count, data = pf)

将透明度设为 alpha = 1/20(大约 20 个点重叠才相当于 1 个完全不透明的点),并用 xlim 限定 x 轴的取值范围

# Scatter plot with each point drawn at alpha = 1/20 and the x axis limited
# to ages 13 through 90.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point(alpha = 1 / 20) +
  xlim(13, 90)

 

加入抖动噪声  jitter

# Same plot, but with geom_jitter() so the points are randomly displaced
# instead of stacking on the integer ages.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_jitter(alpha = 1 / 20) +
  xlim(13, 90)

 

好友数不能为负,否则取平方根会得到虚数。因此抖动时令 h = 0(高度方向不加噪声,只在年龄方向抖动),再用 coord_trans 对 y 轴(好友数)做平方根变换

# Jitter the points horizontally only (height = 0 leaves friend_count
# untouched), then display the y axis on a square-root scale.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point(alpha = 1 / 20, position = position_jitter(height = 0)) +
  xlim(13, 90) +
  coord_trans(y = "sqrt")

0.4.0 版本的 dplyr 在 summarise 汇总中使用中位数函数 median() 时有一个 bug,是否触发取决于被汇总数据的类型。在本地计算机上使用该包时,你可能需要先将数据转换为数值型(浮点型),比如:median(as.numeric(var))

 

library(dplyr)
library(ggplot2)

# Group the users by age, then build one summary row per age holding the
# mean friend count, the median friend count and the number of users.
age_groups <- group_by(pf, age)
pf.fc_by_age <- arrange(
  summarise(
    age_groups,
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ),
  age
)

# Show the first rows of the summary table.
head(pf.fc_by_age)

或者(在较新版本的 dplyr (0.3.x+) 中,语法 %.% 已被弃用且替换为 %>%)

# Piped version of the per-age summary. The pipe already supplies the grouped
# data as the first argument of summarise(), so no data argument is passed
# explicitly; only the named summary columns follow.
pf.fc_by_age <- pf %>%
  group_by(age) %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ) %>%
  arrange(age)

# Show the first rows of the summary table.
head(pf.fc_by_age)

 head(pf.fc_by_age)
# A tibble: 6 x 4
    age friend_count_mean friend_count_median     n
  <int>             <dbl>               <dbl> <int>
1    13              165.                 74    484
2    14              251.                132   1925
3    15              348.                161   2618
4    16              352.                172.  3086
5    17              350.                156   3283
6    18              331.                162   5196
>

 

# Mean friend count per age, drawn as points.
ggplot(data = pf.fc_by_age, mapping = aes(x = age, y = friend_count_mean)) +
  geom_point()

# The same summary drawn as a connected line.
ggplot(data = pf.fc_by_age, mapping = aes(x = age, y = friend_count_mean)) +
  geom_line()

 

 

# Orange, horizontally jittered scatter plot (alpha = 1/10) for ages 13-90
# with a square-root y axis.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point(
    alpha = 1 / 10,
    position = position_jitter(height = 0),
    color = "orange"
  ) +
  xlim(13, 90) +
  coord_trans(y = "sqrt")

# The same scatter plot with a line layer added on top that summarises
# friend_count with mean() at each age.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point(
    alpha = 1 / 10,
    position = position_jitter(height = 0),
    color = "orange"
  ) +
  xlim(13, 90) +
  coord_trans(y = "sqrt") +
  geom_line(stat = "summary", fun.y = mean)

 

 

# Jittered scatter plot overlaid with the mean and the 10%, 50% and 90%
# quantiles of friend_count at each age.
#
# The x range is set with the xlim() scale function. Writing
# `xlim = c(13, 90) +` inside a `+` chain is parsed as an assignment and
# fails. coord_cartesian(xlim = ...) is not used here because coord_trans()
# further down the chain would replace it.
# NOTE(review): `fun.y` is deprecated in ggplot2 >= 3.3.0 in favour of `fun`;
# it is kept here so the snippet still runs on older ggplot2 versions.
ggplot(aes(x = age, y = friend_count), data = pf) +
  xlim(13, 90) +
  geom_point(alpha = 0.05,
             position = position_jitter(h = 0),
             color = 'orange') +
  coord_trans(y = 'sqrt') +
  # Mean friend count at each age.
  geom_line(stat = 'summary', fun.y = mean) +
  # 10th percentile at each age.
  geom_line(stat = 'summary', fun.y = quantile, fun.args = list(probs = .1),
            linetype = 2, color = 'blue') +
  # Median (50th percentile) at each age.
  geom_line(stat = 'summary', fun.y = quantile, fun.args = list(probs = .5),
            linetype = 2, color = 'blue') +
  # 90th percentile at each age.
  geom_line(stat = 'summary', fun.y = quantile, fun.args = list(probs = .9),
            linetype = 2, color = 'blue')

# Zoomed view: coord_cartesian() restricts the visible window to ages 13-90
# and friend counts 0-1000. The layers are the jittered orange points, the
# per-age mean, and the per-age 10%, 50% and 90% quantiles as dashed blue
# lines.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  coord_cartesian(xlim = c(13, 90), ylim = c(0, 1000)) +
  geom_point(
    alpha = 0.05,
    position = position_jitter(height = 0),
    color = "orange"
  ) +
  geom_line(stat = "summary", fun.y = mean) +
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = 0.1),
    linetype = 2, color = "blue"
  ) +
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = 0.5),
    linetype = 2, color = "blue"
  ) +
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = 0.9),
    linetype = 2, color = "blue"
  )

 

计算相关系数,下面两种写法等价

# Pearson correlation test between age and friend_count, referencing the
# columns through `pf$`.
cor.test(
  pf$age,
  pf$friend_count,
  method = "pearson"
)

# The same test with the columns looked up inside `pf` via with().
with(
  pf,
  cor.test(age, friend_count, method = "pearson")
)

data:  age and friend_count
t = -8.6268, df = 99001, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.03363072 -0.02118189
sample estimates:
        cor 
-0.02740737 

默认是pearson方法 ,可以省略

# Pearson correlation test restricted to users aged 70 or younger.
with(
  subset(pf, age <= 70),
  cor.test(age, friend_count, method = "pearson")
)

# Same subset, using the Spearman method instead.
with(
  subset(pf, age <= 70),
  cor.test(age, friend_count, method = "spearman")
)
 

# List the column names of the data set.
names(pf)

# Scatter plot of likes_received against www_likes_received.
ggplot(
  data = pf,
  mapping = aes(x = www_likes_received, y = likes_received)
) +
  geom_point()

拟合曲线

首先计算相关性

# Correlation test between the two like counts (cor.test() default method).
cor.test(
  pf$www_likes_received,
  pf$likes_received
)

相关系数约为 0.9,说明两者有很强的线性相关性,接下来用 lm 做线性拟合

# Scatter plot of the two like counts with both axes cut at their 95th
# percentile and a red linear-model smoother drawn on top.
ggplot(
  data = pf,
  mapping = aes(x = www_likes_received, y = likes_received)
) +
  geom_point() +
  xlim(0, quantile(pf$www_likes_received, probs = 0.95)) +
  ylim(0, quantile(pf$likes_received, probs = 0.95)) +
  geom_smooth(method = "lm", color = "red")
 

 

# Install and attach alr3, then load its Mitchell data set and open the
# help page for it.
install.packages("alr3")
library(alr3)

data("Mitchell")
?Mitchell

下面两种写法等价

# Scatter plot of Temp against Month, first with ggplot() ...
ggplot(data = Mitchell, mapping = aes(x = Month, y = Temp)) +
  geom_point()

# ... and then with the qplot() shorthand.
qplot(Month, Temp, data = Mitchell)

# Correlation test between Month and Temp on a copy of the data set.
data <- Mitchell
cor.test(
  data$Month,
  data$Temp
)

将x轴变清晰

# Same scatter plot with x-axis breaks every 12 months from 0 up to 203.
ggplot(data = Mitchell, mapping = aes(x = Month, y = Temp)) +
  geom_point() +
  scale_x_continuous(breaks = seq(from = 0, to = 203, by = 12))

很像正弦函数

 

# Overlay every year on one 12-month cycle by plotting Temp against Month
# modulo 12.
ggplot(data = Mitchell, mapping = aes(x = (Month %% 12), y = Temp)) +
  geom_point()

# Line plot of the mean friend count per age.
ggplot(data = pf.fc_by_age, mapping = aes(x = age, y = friend_count_mean)) +
  geom_line()

可以看出上图噪声很多(不平滑部分)

# Look at the first ten rows of the per-age summary, then at rows 17-19.
head(pf.fc_by_age, n = 10)
pf.fc_by_age[17:19, ]

 

# Derive age_with_months as age plus (1 - dob_month / 12), referencing the
# columns through `pf$`.
pf$age_with_months <- pf$age + (1 - pf$dob_month / 12)

# The same assignment, with the columns looked up inside `pf` via with().
pf$age_with_months <- with(
  pf,
  age + (1 - dob_month / 12)
)

 

library(dplyr)

# One summary row per age_with_months value: mean friend count, median
# friend count and group size, sorted by age_with_months.
pf.fc_by_age_months <- pf %>%
  group_by(age_with_months) %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ) %>%
  arrange(age_with_months)

# Show the first rows of the summary table.
head(pf.fc_by_age_months)

 

# The same summary built step by step without the pipe operator.
age_with_months_groups <- group_by(pf, age_with_months)
pf.fc_by_age_months2 <- arrange(
  summarise(
    age_with_months_groups,
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ),
  age_with_months
)

# Show the first rows of the summary table.
head(pf.fc_by_age_months2)
 

年龄小于 71 岁的用户:平均好友数随按月细分的年龄(age_with_months)的变化

# Mean friend count against age_with_months, for age_with_months below 71.
ggplot(
  data = subset(pf.fc_by_age_months, age_with_months < 71),
  mapping = aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line()

 

 

# p1: mean friend count by age_with_months (below 71).
p1 <- ggplot(
  data = subset(pf.fc_by_age_months, age_with_months < 71),
  mapping = aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line()

# p2: mean friend count by whole-year age (below 71).
p2 <- ggplot(
  data = subset(pf.fc_by_age, age < 71),
  mapping = aes(x = age, y = friend_count_mean)
) +
  geom_line()

# Stack the two plots in a single column, yearly plot on top.
library(gridExtra)
grid.arrange(p2, p1, ncol = 1)

# p1: mean friend count by age_with_months (below 71).
p1 <- ggplot(
  data = subset(pf.fc_by_age_months, age_with_months < 71),
  mapping = aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line()

# p2: mean friend count by whole-year age (below 71).
p2 <- ggplot(
  data = subset(pf.fc_by_age, age < 71),
  mapping = aes(x = age, y = friend_count_mean)
) +
  geom_line()

# p3: mean friend count with age rounded to the nearest multiple of 5,
# computed from the raw data by the summary stat.
p3 <- ggplot(
  data = subset(pf, age < 71),
  mapping = aes(x = round(age / 5) * 5, y = friend_count)
) +
  geom_line(stat = "summary", fun.y = mean)

# Stack the three plots in a single column: 5-year bins, yearly, monthly.
library(gridExtra)
grid.arrange(p3, p2, p1, ncol = 1)

# p1: mean friend count by age_with_months (below 71) plus a default
# geom_smooth() layer.
p1 <- ggplot(
  data = subset(pf.fc_by_age_months, age_with_months < 71),
  mapping = aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line() +
  geom_smooth()

# p2: mean friend count by whole-year age (below 71) plus a default
# geom_smooth() layer.
p2 <- ggplot(
  data = subset(pf.fc_by_age, age < 71),
  mapping = aes(x = age, y = friend_count_mean)
) +
  geom_line() +
  geom_smooth()

# p3: mean friend count with age rounded to the nearest multiple of 5.
p3 <- ggplot(
  data = subset(pf, age < 71),
  mapping = aes(x = round(age / 5) * 5, y = friend_count)
) +
  geom_line(stat = "summary", fun.y = mean)

# Stack the three plots in a single column.
library(gridExtra)
grid.arrange(p3, p2, p1, ncol = 1)

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值