(1)dplyr包使用do来处理分组汇总函数返回多值的问题
(2)lapply函数与dplyr的效率比较
lapply函数使用之前需要对原始数据拆分为list的结构(使用split函数),而拆分的过程当数据量稍大时速度很慢
# 计算分组下一个变量的分位数
df <- data.frame(group=sample(c("A", "B"), 100, replace=T),
value=rnorm(100, 2, 5))
df %>%
group_by(group) %>%
do(setNames(data.frame(t(quantile(.$value,
probs=seq(0, 1, 0.1)))),
paste0("Q", seq(0, 100, by=10))))
# Source: local data frame [2 x 12]
# Groups: group
#
# group Q0 Q10 Q20 Q30 Q40 Q50 Q60 Q70 Q80 Q90 Q100
# 1 A -13.46 -4.185 -2.881 -0.7118 1.4251 1.934 3.292 4.478 6.212 10.25 12.98
# 2 B -11.41 -4.900 -3.105 -0.9380 0.7811 1.431 2.158 2.884 4.002 6.39 11.73
# 扩展一个向量的所有组合的函数expand.grid
expandFun <- function(x){
expand.grid(col1=x, col2=x)
}
expandFun(LETTERS[1:4])
# col1 col2
# 1 A A
# 2 B A
# 3 C A
# 4 D A
# 5 A B
# 6 B B
# 7 C B
# 8 D B
# 9 A C
# 10 B C
# 11 C C
# 12 D C
# 13 A D
# 14 B D
# 15 C D
# 16 D D
########
# 实验:分组实施该函数
# 实验数据
by_species <- iris %>%
mutate(Sample=sample(LETTERS[1:4], size=150, replace = T)) %>%
group_by(Species) %>%
select(Species, Sample) %>%
unique() %>%
group_by(Species)
# Source: local data frame [12 x 2]
# Groups: Species
#
# Species Sample
# 1 setosa D
# 2 setosa B
# 3 setosa A
# 4 setosa C
# 5 versicolor B
# 6 versicolor D
# 7 versicolor A
# 8 versicolor C
# 9 virginica C
# 10 virginica B
# 11 virginica A
# 12 virginica D
# 以Species字段分组对Sample字段扩展
df_by <- as.data.frame(by_species)
df_by %>% split(f=df_by$Species) %>%
lapply(FUN=function(x){
expandFun(x$Sample)})
# $setosa
# col1 col2
# 1 D D
# 2 B D
# 3 A D
# 4 C D
# 5 D B
# 6 B B
# 7 A B
# 8 C B
# 9 D A
# 10 B A
# 11 A A
# 12 C A
# 13 D C
# 14 B C
# 15 A C
# 16 C C
#
# $versicolor
# col1 col2
# 1 B B
# 2 D B
# 3 A B
# 4 C B
# 5 B D
# 6 D D
# 7 A D
# 8 C D
# 9 B A
# 10 D A
# 11 A A
# 12 C A
# 13 B C
# 14 D C
# 15 A C
# 16 C C
#
# $virginica
# col1 col2
# 1 C C
# 2 B C
# 3 A C
# 4 D C
# 5 C B
# 6 B B
# 7 A B
# 8 D B
# 9 C A
# 10 B A
# 11 A A
# 12 D A
# 13 C D
# 14 B D
# 15 A D
# 16 D D
system.time(df_by %>% split(f=df_by$Species) %>%
lapply(FUN=function(x){
expandFun(x$Sample)}))
# 用户 系统 流逝
# 0.02 0.00 0.02
## 使用dplyr包的do函数,需要将返回的结果直接保存为data.frame,并且效率更高
by_species %>%
group_by(Species) %>%
do(data.frame(expandFun(.$Sample)))
# Source: local data frame [48 x 3]
# Groups: Species
#
# Species col1 col2
# 1 setosa D D
# 2 setosa B D
# 3 setosa A D
# 4 setosa C D
# 5 setosa D B
# 6 setosa B B
# 7 setosa A B
# 8 setosa C B
# 9 setosa D A
# 10 setosa B A
# 11 setosa A A
# 12 setosa C A
# 13 setosa D C
# 14 setosa B C
# 15 setosa A C
# 16 setosa C C
# 17 versicolor B B
# 18 versicolor D B
# 19 versicolor A B
# 20 versicolor C B
# 21 versicolor B D
# 22 versicolor D D
# 23 versicolor A D
# 24 versicolor C D
# 25 versicolor B A
# 26 versicolor D A
# 27 versicolor A A
# 28 versicolor C A
# 29 versicolor B C
# 30 versicolor D C
# 31 versicolor A C
# 32 versicolor C C
# 33 virginica C C
# 34 virginica B C
# 35 virginica A C
# 36 virginica D C
# 37 virginica C B
# 38 virginica B B
# 39 virginica A B
# 40 virginica D B
# 41 virginica C A
# 42 virginica B A
# 43 virginica A A
# 44 virginica D A
# 45 virginica C D
# 46 virginica B D
# 47 virginica A D
# 48 virginica D D
system.time(by_species %>%
group_by(Species) %>%
do(data.frame(expandFun(.$Sample))))
# 用户 系统 流逝
# 0 0 0 <pre code_snippet_id="644766" snippet_file_name="blog_20150415_2_3591916" name="code" class="plain" style="font-size: 13.3333339691162px;"><pre code_snippet_id="644766" snippet_file_name="blog_20150415_1_1427398" name="code" class="plain" style="font-size: 13.3333339691162px;"># <span style="font-size: 13.3333339691162px; font-family: Arial, Helvetica, sans-serif;">当数据量较大时速度的差别明显</span>
参考:
类似问题data.table也可以实现且性能也较好