R语言dplyr包主要功能:
- 按取值筛选观察值(行)(filter())。
- 对行重新排序(arrange())。
- 按变量名称选取变量(列)(select())。
- 基于现有变量的函数创建新变量(mutate())。
- 将多个值汇总为一个摘要值(summarise())。
这些函数都可以与group_by()结合使用,它会把每个函数的作用范围从整个数据集改为逐组操作。
1. filter 选择满足条件的行
#install.packages("tidyverse")
#install.packages("dplyr")
library(ggplot2)
library(dplyr)
# library(tidyverse)
library(nycflights13)
# ?flights
## filter(): keep the rows that satisfy every condition
# Rows where both month and day equal 1.
flights %>% filter(month == 1, day == 1)
# Rows from month 11 or 12, written with an explicit OR ...
flights %>% filter(month == 11 | month == 12)
# ... and the same selection expressed as set membership.
nov_dec <- flights %>% filter(month %in% c(11, 12))
# Rows where neither delay exceeds 120
# (De Morgan form of !(arr_delay > 120 | dep_delay > 120)).
flights %>% filter(arr_delay <= 120, dep_delay <= 120)
# Rows of starwars with hair_color "none" and eye_color "black".
starwars %>% filter(hair_color == "none", eye_color == "black")
# A condition that evaluates to NA does not keep the row,
# so NA rows must be requested explicitly with is.na().
df <- tibble(x = c(1, NA, 3))
df %>% filter(x > 1)
df %>% filter(x > 1 | is.na(x))
# Grouped filter: keep rows whose mass is above the mean mass of their
# gender group.
starwars %>%
  group_by(gender) %>%
  filter(mass > mean(mass, na.rm = TRUE))
2. arrange 按列值排列行
## arrange(): order rows by column values
# Sort by year, then month, then day.
flights %>% arrange(year, month, day)
# Sort by dep_delay in descending order.
flights %>% arrange(desc(dep_delay))
# Shows where the NA row lands for ascending and descending sorts.
df <- tibble(x = c(5, 2, NA))
df %>% arrange(x)
df %>% arrange(desc(x))
3. select 根据名称和类型来选择列
## select(): choose columns by name or type
flights %>% select(year, month, day)
# Every column from year through day, endpoints included.
flights %>% select(year:day)
# Every column except year through day.
flights %>% select(-(year:day))
# Rename the tailnum column to tail_num.
flights %>% rename(tail_num = tailnum)
# everything() matches all variables, so this puts time_hour and air_time
# first and keeps the rest after them.
flights %>% select(time_hour, air_time, everything())
# Columns whose names contain "time".
flights %>% select(contains("time"))
# Columns whose names start with "arr".
flights %>% select(starts_with("arr"))
4. mutate 创建、修改和删除列
## mutate(): create, modify and delete columns
# Narrow table used for the mutate() example below.
flights_sml <- flights %>%
  select(year:day, ends_with("delay"), distance, air_time)
# Add gain and speed alongside the existing columns.
flights_sml %>%
  mutate(
    gain = dep_delay - arr_delay,
    speed = distance / air_time * 60
  )
# transmute() keeps only the newly created columns (variables).
flights %>%
  transmute(
    gain = dep_delay - arr_delay,
    hours = air_time / 60,
    gain_per_hour = gain / hours
  )
# Unlike `/`, `%/%` is integer division and `%%` is the remainder.
flights %>%
  transmute(
    dep_time,
    hour = dep_time %/% 100,
    minute = dep_time %% 100
  )
# Ranking helpers applied to a vector with a tie and an NA.
y <- c(1, 2, 2, NA, 3, 4)
min_rank(y)
min_rank(desc(y))
row_number(y)
percent_rank(y)
cume_dist(y)
5. summarise 创建新的数据框
## summarise(): collapse rows into a new data frame
# Mean dep_delay over the whole table, ignoring NA values.
flights %>% summarise(delay = mean(dep_delay, na.rm = TRUE))
# The same mean per (year, month, day), via a saved grouped tibble ...
by_day <- flights %>% group_by(year, month, day)
by_day %>% summarise(delay = mean(dep_delay, na.rm = TRUE))
# ... as a single nested call ...
summarise(
  group_by(flights, year, month, day),
  delay = mean(dep_delay, na.rm = TRUE)
)
# ... and with the result column named `mean` instead of `delay`.
summarise(by_day, mean = mean(dep_delay, na.rm = TRUE))
# Per-destination summary: number of rows, mean distance and mean arrival
# delay; then keep destinations with more than 20 rows, excluding "HNL".
delays <- flights %>%
  group_by(dest) %>%
  summarise(
    count = n(),  # n() gives the size of the current group
    dist = mean(distance, na.rm = TRUE),
    delay = mean(arr_delay, na.rm = TRUE)
  ) %>%
  filter(count > 20, dest != "HNL")

# Rows where both dep_delay and arr_delay are present.
not_cancelled <- flights %>%
  filter(!is.na(dep_delay) & !is.na(arr_delay))

# With the NA delays already removed, mean() needs no na.rm argument.
not_cancelled %>%
  group_by(year, month, day) %>%
  summarise(mean = mean(dep_delay))
# Common functions used inside summarise():
#   Center:   mean(), median()
#   Spread:   sd(), IQR(), mad()
#   Range:    min(), max(), quantile()
#   Position: first(), last(), nth()
#   Count:    n(), n_distinct()
#   Logical:  any(), all()

# Within each (year, month, day) group, rank rows by dep_time in
# descending order.
not_cancelled %>%
  group_by(year, month, day) %>%
  mutate(r = min_rank(desc(dep_time)))

# Per group, count the rows with dep_time below 500
# (summing a logical vector counts its TRUE values).
not_cancelled %>%
  group_by(year, month, day) %>%
  summarise(n_early = sum(dep_time < 500))

# ungroup() removes grouping. The tibble is grouped first so that the call
# has something to undo; summarise() then returns one overall row count
# instead of one count per group.
not_cancelled %>%
  group_by(year, month, day) %>%
  ungroup() %>%
  summarise(flights = n())
6. 数据框合并
## 数据框合并函数
semi_join(): returns all rows from x that have a match in y.
anti_join(): returns all rows from x that have no match in y.
inner_join(): includes only the rows that have a match in both x and y.
left_join(): includes all rows in x.
right_join(): includes all rows in y.
full_join(): includes all rows in x or y.