R数据分析多变量

最新推荐文章于 2024-05-20 15:07:00 发布

月夜星星雨

最新推荐文章于 2024-05-20 15:07:00 发布

阅读量1.9k

点赞数

分类专栏： R

本文链接：https://blog.csdn.net/m493096871/article/details/89532115

版权

R 专栏收录该内容

4 篇文章 0 订阅

订阅专栏

仍然使用facebook用户数据

ggplot(aes(x=gender,y=age),
data=subset(pf,!is.na(gender)))+ geom_boxplot() +
stat_summary(fun.y=mean,geom= 'point',shape= 4)

ggplot(aes(x = age,y= friend_count),
data = subset(pf,!is.na(gender)))+
geom_line(aes(color= gender),stat= 'summary',fun.y=median)

suppressMessages(library(dplyr))

pf.fc_by_age_gender <- pf %>%
filter(!is.na(gender)) %>%
group_by(age,gender) %>%
summarise(mean_friend_count = mean(friend_count),
median_friend_count = median(friend_count),
n=n()) %>%
ungroup() %>%
arrange(age)

head(pf.fc_by_age_gender)

ggplot(aes(x = age, y = friend_count),
data = subset(pf, !is.na(gender))) +
geom_line(aes(color = gender), stat = 'summary', fun.y = median)

ggplot(aes(x = age, y = median_friend_count),
data = pf.fc_by_age_gender) +
geom_line(aes(color = gender))

install.packages("tidyr")
library(tidyr)

spread(subset(pf.fc_by_age_gender,
select = c('gender', 'age', 'median_friend_count')),
gender, median_friend_count)

重塑数据

install.packages("reshape2")
library(reshape2)

pf.fc_by_age_gender.wide <- subset(pf.fc_by_age_gender[c('age', 'gender', 'median_friend_count')], !is.na(gender)) %>%
spread(gender, median_friend_count) %>%
mutate(ratio = male / female)

head(pf.fc_by_age_gender.wide)

pf.fc_by_age_gender.wide <- dcast(pf.fc_by_age_gender,
age ~ gender,
value.var = 'median_friend_count')
head(pf.fc_by_age_gender.wide)

> head(pf.fc_by_age_gender.wide)
age female male
1 13 148.0 55.0
2 14 224.0 92.5
3 15 276.0 106.5
4 16 258.5 136.0
5 17 245.5 125.0

比率图

ggplot(aes(x=age,y= female / male),
data =pf.fc_by_age_gender.wide) +
geom_line() +
geom_hline(yintercept = 1,alpha = 0.3,linetype=2)

pf$year_joined <- floor(2014 - pf$tenure/365)
summary(pf$year_joined)
table(pf$year_joined)

summary(pf$year_joined)
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
2005 2012 2012 2012 2013 2014 2
> table(pf$year_joined)

2005 2006 2007 2008 2009 2010 2011 2012 2013 2014
9 15 581 1507 4557 5448 9860 33366 43588 70

pf$year_joined.bucket <- cut(pf$year_joined,
c(2004,2009,2011,2012,2014))
table(pf$year_joined.bucket,useNA = 'ifany')
ggplot(aes(x = age, y = friend_count),
data = subset(pf, !is.na(gender))) +
geom_line(aes(color = gender), stat = 'summary', fun.y = median)

ggplot(aes(x = age, y = friend_count),
data = subset(pf, !is.na(year_joined.bucket))) +
geom_line(aes(color = year_joined.bucket),
stat = 'summary',
fun.y = median)

ggplot(aes(x = age, y = friend_count),
data = subset(pf, !is.na(year_joined.bucket))) +
geom_line(aes(color = year_joined.bucket),
stat = 'summary',
fun.y = mean) +
geom_line(stat='summary',fun.y= mean ,linetype=2)

将中位数化为虚线

总结好友率

with(subset(pf,tenure >=1),summary(friend_count / tenure))

Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0000 0.0775 0.2205 0.6096 0.5658 417.0000

ggplot(aes(x=tenure,y=friendships_initiated /tenure),
data=subset(pf,tenure >=1)) +
geom_line(aes(color=year_joined.bucket))

ggplot(aes(x=tenure,y=friendships_initiated /tenure),
data=subset(pf,tenure >=1)) +
geom_line(aes(color=year_joined.bucket),
stat='summary',
fun.y=mean)

降噪方式1，理解偏差-方差折衷

x = tenure 替换为 x = 7 * round(tenure / 7)

ggplot(aes(x= 7 * round(tenure / 7),y=friendships_initiated /tenure),
data=subset(pf,tenure >=1)) +
geom_line(aes(color=year_joined.bucket),
stat='summary',
fun.y=mean)

ggplot(aes(x= 30 * round(tenure / 30),y=friendships_initiated /tenure),
data=subset(pf,tenure >=1)) +
geom_line(aes(color=year_joined.bucket),
stat='summary',
fun.y=mean)

下面分析酸奶数据集

https://s3.amazonaws.com/udacity-hosted-downloads/ud651/yogurt.csv

首先选择id转换为因子变量

yo <- read.csv('yogurt.csv')
str(yo)
$ id : int 2100081 2100081 2100081 2100081 2100081 2100081 2100081 2100081 2100081 2100081 ...
yo$id <- factor(yo$id)
str(yo)

$ id : Factor w/ 332 levels "2100081","2100370",..: 1 1 1 1 1 1 1 1 1 1 ...

qplot(data= yo ,x = price, fill = I ('#F79420'))

qplot(data= yo ,x = price, fill = I ('#F79420'),binwidth=10)

summary(yo)
unique(yo$price)
length(unique(yo$price))

summary(yo)
obs id time strawberry
Min. : 1.0 2132290: 74 Min. : 9662 Min. : 0.0000
1st Qu.: 696.5 2130583: 59 1st Qu.: 9843 1st Qu.: 0.0000
Median :1369.5 2124073: 50 Median :10045 Median : 0.0000
Mean :1367.8 2149500: 50 Mean :10050 Mean : 0.6492
3rd Qu.:2044.2 2101790: 47 3rd Qu.:10255 3rd Qu.: 1.0000
Max. :2743.0 2129528: 39 Max. :10459 Max. :11.0000
(Other):2061
blueberry pina.colada plain mixed.berry
Min. : 0.0000 Min. : 0.0000 Min. :0.0000 Min. :0.0000
1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.:0.0000 1st Qu.:0.0000
Median : 0.0000 Median : 0.0000 Median :0.0000 Median :0.0000
Mean : 0.3571 Mean : 0.3584 Mean :0.2176 Mean :0.3887
3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
Max. :12.0000 Max. :10.0000 Max. :6.0000 Max. :8.0000

price
Min. :20.00
1st Qu.:50.00
Median :65.04
Mean :59.25
3rd Qu.:68.96
Max. :68.96

> length(unique(yo$price))
[1] 20
> unique(yo$price)
[1] 58.96 65.04 48.96 68.96 39.04 24.96 50.00 45.04 33.04 44.00 33.36 55.04
[13] 62.00 20.00 49.60 49.52 33.28 63.04 33.20 33.52

创建购买数的新变量,这里第一个方法使用了传递函数

yo <- transform(yo,all.purchases = strawberry+
blueberry+pina.colada+plain+mixed.berry)

或者

yo$all.purchases <- yo$strawberry+
yo$blueberry+yo$pina.colada+yo$plain+yo$mixed.berry

summary(yo$all.purchases)

Min. 1st Qu. Median Mean 3rd Qu. Max.
1.000 1.000 2.000 1.971 2.000 21.000

qplot(x = all.purchases,data = yo, binwidth = 1,
fill = I('#099DD9'))

ggplot(aes(x = time, y = price),data = yo) +
geom_jitter(alpha = 1/4, shape = 21, fill = I('#F79420'))

x %in% y 返回一个长度与 x 相同的逻辑（布尔）向量，该向量指出 x 中的每一个条目是否都出现在 y 中。也就是说，对于 x 中的每一个条目，该向量都会检查这一条目是否也出现在 y 中。

这样，我们就能将数据子集化，从而获得样本中住户的所有购买时机了。然后，我们通过样本 ID 创建价格与时间的散点图和分面。

在绘制散点时，使用 pch 或 shape 参数来指定符号

set.seed(4230)
sample.ids <- sample(levels(yo$id), 16)

ggplot(aes(x = time, y = price),
data = subset(yo, id %in% sample.ids))+
facet_wrap( ~ id ) +
geom_line() +
geom_point(aes(size = all.purchases), pch = 1 )

散点图矩阵

install.packages('GGally') 来安装包，以创建此特定的散点图矩阵。

如果图形需要很长时间才能呈现，或者如果你想查看散点图矩阵的其中一部分，你只需检查少量的变量。你可以使用以下代码或选择较少的变量。我们建议将性别（第 6 个变量）包含在内！

pf_subset <- pf[ , c(2:7)]

你可能还会发现：变量标签是在散点图矩阵的外边缘上，而非对角线上。如果你希望标签在对角线上，你可以在 ggpairs 命令中设置 axisLabels = 'internal' 参数。

install.packages('GGally')
library(GGally)

theme_set(theme_minimal(20))

set.seed(1836)
pf_subset <- pf[, c(2:15)]
names(pf_subset)
ggpairs(pf_subset[sample.int(nrow(pf_subset),1000), ])

制作热图

下面使用基因数据集

https://s3.amazonaws.com/udacity-hosted-downloads/ud651/nci.tsv

library(reshape2)
nci <- read.table('nci.tsv')
colnames(nci) <- c(1:64)
nci.long.samp <- melt(as.matrix(nci[1:200, ]))
names(nci.long.samp) <- c('gene', 'case', 'value')
head(nci.long.samp)

ggplot (aes(y = gene, x = case, fill = value),
data = nci.long.samp ) +
geom_tile() +
scale_fill_gradientn(colours = colorRampPalette(c('blue',
'red'))(100))

月夜星星雨

关注

0
点赞
踩
3

收藏

觉得还不错? 一键收藏
0
评论
R数据分析多变量

仍然使用facebook用户数据ggplot(aes(x=gender,y=age), data=subset(pf,!is.na(gender)))+ geom_boxplot() + stat_summary(fun.y=mean,geom= 'point',shape= 4)ggplot(aes(x = age,y= friend_count), ...
复制链接

扫一扫

专栏目录