# filter ----
# Keep only the flights from January 1st; conditions separated by commas
# must all hold for a row to be kept.
flights %>%
  filter(month == 1, day == 1)

# Flights from November or December, written in two equivalent ways.
flights %>%
  filter(month == 11 | month == 12)
flights %>%
  filter(month %in% c(11, 12))
# Note: `month == 11 | 12` does not express this. It parses as
# `(month == 11) | 12`, which is TRUE for every row.
# Count how many departure times are missing: build a single logical flag
# column (TRUE = missing) and tabulate it.
flights %>%
  transmute(temp = is.na(dep_time)) %>%
  table()
# arrange ----
# Sort rows by arrival delay, smallest first (ascending is the default).
flights %>%
  arrange(arr_delay)

# Same sort in descending order: largest arrival delay first.
flights %>%
  arrange(desc(arr_delay))
# select ----
# Pick individual columns by name.
flights %>%
  select(year, month, day)

# Pick the contiguous run of columns from year through day (inclusive).
flights %>%
  select(year:day)

# Drop that same run of columns and keep everything else.
flights %>%
  select(-(year:day))
## rename
# rename() takes `new_name = old_name`: the column `tailnum` becomes
# `tail_num`; all other columns are left untouched.
flights %>%
  rename(tail_num = tailnum)
## Reorder columns: select() returns them in the order they are listed
# NOTE(review): `stocks` is not defined in this file -- confirm where it
# comes from before running this line.
select(stocks, year, half, return)
# mutate ----
# Append derived columns to flights. Later expressions may use columns
# created earlier in the same call (gain_per_hour uses gain and hours).
flights %>%
  mutate(
    gain = arr_delay - dep_delay,
    hours = air_time / 60,
    gain_per_hour = gain / hours
  )
# transmute() works like mutate() but keeps only the columns named in the call.
# Integer division and remainder by 100 split dep_time into hour and minute
# (assumes dep_time is encoded as HHMM -- TODO confirm).
flights %>%
  transmute(
    dep_time,
    hour = dep_time %/% 100,
    minute = dep_time %% 100
  )
# summarise ----
# Average departure delay for each calendar day.
# na.rm = TRUE drops missing delays so they do not turn the mean into NA.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
flights %>%
  group_by(year, month, day) %>%
  summarise(average_delay = mean(dep_delay, na.rm = TRUE))
# Subsetting inside summarise(): a logical index restricts an aggregate to
# part of each group without filtering the whole data frame first.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
flights %>%
  group_by(year, month, day) %>%
  summarise(
    # mean arrival delay over all flights of the day
    avg_delay1 = mean(arr_delay, na.rm = TRUE),
    # the average positive delay: only flights that arrived late
    avg_delay2 = mean(arr_delay[arr_delay > 0], na.rm = TRUE)
  )
# Other useful functions ----
## Cumulative aggregates
# Running total and running mean over the integers 1 to 10.
x <- seq_len(10)
cumsum(x)
cummean(x)
## Ranking
# Sample vector with a tie (20, 20) and a missing value.
y <- c(10, 20, 20, NA, 30, 40)

# Ascending ranks: the smallest value gets rank 1, tied values share a rank,
# and NA stays NA.
min_rank(y)

# desc() reverses the ordering: the largest value gets rank 1 and the
# smallest values get the highest rank numbers.
min_rank(desc(y))
# Compare the three ranking helpers on a column that contains ties.
# `rankme` is not created anywhere else in this file, so sample data is
# defined here; the values are chosen to reproduce the printed output below.
rankme <- tibble(x = c(10, 5, 1, 5, 5))

rankme <- mutate(
  rankme,
  x_row_number = row_number(x), # ties broken by position: 2, 3, 4
  x_min_rank = min_rank(x),     # ties share a rank, then a gap: 2, 2, 2, 5
  x_dense_rank = dense_rank(x)  # ties share a rank, no gap: 2, 2, 2, 3
)

# Sort by x so the three rankings can be compared row by row.
arrange(rankme, x)
#> # A tibble: 5 x 4
#>       x x_row_number x_min_rank x_dense_rank
#>   <dbl>        <int>      <int>        <int>
#> 1     1            1          1            1
#> 2     5            2          2            2
#> 3     5            3          2            2
#> 4     5            4          2            2
#> 5    10            5          5            3
## Show the extreme elements
# For each day, keep the rows whose departure-time rank is the smallest or
# the largest rank within that day.
flights %>%
  # drop rows where either delay is missing
  filter(!is.na(dep_delay) & !is.na(arr_delay)) %>%
  select(year, month, day, dep_time) %>%
  group_by(year, month, day) %>%
  # rank within each day; desc() gives rank 1 to the largest dep_time
  mutate(r = min_rank(desc(dep_time))) %>%
  # range(r) returns the smallest and largest rank of the group
  filter(r %in% range(r))
## Number of distinct values
# "a" appears twice, so the vector holds three distinct values.
x <- c("a", "a", "b", "c")
n_distinct(x)