使用RStudio创建和处理R Markdown

---
title: "Going deeper with dplyr"
author: "xiexin 20190001"
date: "`r Sys.Date()`"
output:
  html_document:
    # Print data frames as paged tables in the HTML output.
    df_print: paged
  pdf_document:
    # Load the ctex LaTeX package with its UTF8 option (Chinese typesetting).
    extra_dependencies:
      ctex: UTF8
    latex_engine: xelatex
    number_sections: yes
# Pandoc's LaTeX template reads the variable `classoption` (singular); the
# former key `classoptions` was not recognised, so these options never reached
# \documentclass.
classoption:
  - hyperref
  - 12pt
  - a4paper
---


# Loading dplyr and the nycflights13 dataset


```{r}
# Load packages. suppressPackageStartupMessages() hides only the start-up and
# masking notes printed while dplyr is attached, so any other message raised
# during loading still shows up (suppressMessages() would hide those as well).
suppressPackageStartupMessages(library(dplyr))
library(nycflights13)

# print the flights dataset from nycflights13
flights

```

# Choosing columns: select, rename

```{r}

# select() keeps only the columns you name...
flights %>% select(carrier, flight)

# ...while a leading minus sign drops the named columns instead
flights %>% select(-month, -day)

# a whole range of adjacent columns can be dropped in one go
flights %>% select(-(dep_time:arr_delay))

# drop every column whose name contains "time"
flights %>% select(-contains("time"))

# pick columns named in a character vector; all_of() replaces the superseded
# one_of() and raises an error if any listed column does not exist
cols <- c("carrier", "flight", "tailnum")
flights %>% select(all_of(cols))

# select() can rename while selecting, but every column not mentioned is dropped
flights %>% select(tail = tailnum)

# rename() does the same renaming and keeps all the other columns
flights %>% rename(tail = tailnum)

```

# Choosing rows: filter, between, slice, sample_n, top_n, distinct

```{r}

# Several conditions passed to filter() must all hold for a row to be kept.
flights %>%
  filter(dep_time >= 600, dep_time <= 605)

# between() states the same numeric range check more compactly.
flights %>%
  filter(between(dep_time, 600, 605))

# Side note: negating is.na() keeps only the rows whose dep_time is present.
flights %>%
  filter(!is.na(dep_time))

# slice() selects rows by position, here rows 1000 through 1005.
flights %>%
  slice(1000:1005)

# keep the first three rows within each group
flights %>%
  group_by(month, day) %>%
  slice(1:3)

# sample three rows from each group; slice_sample() replaces the superseded
# sample_n()
flights %>%
  group_by(month, day) %>%
  slice_sample(n = 3)

# keep the rows holding the three largest dep_delay values in each group
# (ties are kept); slice_max() replaces the superseded top_n()
flights %>%
  group_by(month, day) %>%
  slice_max(dep_delay, n = 3)

# also sort by dep_delay within each group: arrange() ignores the grouping
# unless .by_group = TRUE is given, so without it the sort would be global
flights %>%
  group_by(month, day) %>%
  slice_max(dep_delay, n = 3) %>%
  arrange(desc(dep_delay), .by_group = TRUE)
# unique rows can be identified using unique() from base R
flights %>%
  select(origin, dest) %>%
  unique()

# dplyr provides an alternative that is more "efficient"
flights %>%
  select(origin, dest) %>%
  distinct()

# side note: with the magrittr pipe (%>%) a function that needs no further
# arguments may be written without parentheses, as in the last step below.
# The parenthesised form used above is the recommended style and the only one
# the native |> pipe accepts.
flights %>%
  select(origin, dest) %>%
  distinct

```

# Adding new variables: mutate, transmute, add_rownames

```{r}

# mutate() creates a new variable (and keeps all existing variables)
flights %>% mutate(speed = distance / air_time * 60)

# transmute() only keeps the new variables
flights %>% transmute(speed = distance / air_time * 60)

# example data frame with row names
mtcars %>% head()

# turn the row names into an explicit "model" column; as_tibble(rownames = )
# replaces the deprecated add_rownames()
mtcars %>%
  as_tibble(rownames = "model") %>%
  head()

# side note: tibbles never print row names; as_tibble() replaces the
# deprecated tbl_df()
mtcars %>% as_tibble()

```

# Grouping and counting: summarise, tally, count, group_size, n_groups, ungroup

```{r}

# Group the flights by month once and reuse the grouped tibble below.
by_month <- flights %>% group_by(month)

# Count the rows of each group with summarise() and n().
by_month %>% summarise(cnt = n())

# tally() on grouped data, or count() on ungrouped data, is shorter.
by_month %>% tally()
flights %>% count(month)

# The counts can be sorted afterwards...
by_month %>%
  summarise(cnt = n()) %>%
  arrange(desc(cnt))

# ...or directly through the sort argument of tally() and count().
by_month %>% tally(sort = TRUE)
flights %>% count(month, sort = TRUE)

# Sum a variable per group instead of counting rows...
by_month %>% summarise(dist = sum(distance))

# ...which tally() and count() do through their wt argument.
by_month %>% tally(wt = distance)
flights %>% count(month, wt = distance)

# group_size() returns the per-group row counts as a plain vector.
by_month %>% group_size()

# n_groups() reports how many groups there are.
by_month %>% n_groups()
# group by two variables, summarise, arrange (output is possibly confusing):
# summarise() drops only the last grouping level, so the result is still
# grouped by month, and .by_group = TRUE makes arrange() sort inside each
# month. Without it arrange() ignores the grouping and this example would
# give the same ordering as the ungrouped one below.
flights %>%
  group_by(month, day) %>%
  summarise(cnt = n()) %>%
  arrange(desc(cnt), .by_group = TRUE) %>%
  print(n = 40)

# ungroup() before arranging to arrange across all groups
flights %>%
  group_by(month, day) %>%
  summarise(cnt = n()) %>%
  ungroup() %>%
  arrange(desc(cnt))

```

# Creating data frames: data_frame

`data_frame()` is a better way than `data.frame()` to create data frames. (It is now deprecated in favour of `tibble()`, which offers the same benefits.) Benefits of `data_frame()`:

- You can use previously defined columns to compute new columns.
- It never coerces column types.
- It never munges column names.
- It never adds row names.
- It only recycles length 1 input.
- It returns a local data frame (a `tbl_df`).

```{r}
# tibble() example (tibble() is the current name of the deprecated
# data_frame()): column b is computed from column a, and the column name
# "d+e" is kept exactly as written
tibble(a = 1:6, b = a * 2, c = "string", "d+e" = 1) %>% glimpse()

# data.frame() example: the same non-syntactic name is rewritten to d.e
data.frame(a = 1:6, c = "string", "d+e" = 1) %>% glimpse()

```

# Viewing more output: print, View

```{r}

# specify that you want to see more rows
flights %>% print(n = 15)

# specify that you want to see ALL rows -- left commented out on purpose,
# because running it would print every row of flights into the document
# flights %>% print(n = Inf)

# specify that you want to see all columns
flights %>% print(width = Inf)

# show up to 1000 rows and all columns in the data viewer; View() needs an
# interactive session, so it is skipped when the document is knitted
if (interactive()) {
  flights %>% View()
}

# set option to see all columns and fewer rows; options() returns the previous
# values, which are kept so they can be restored exactly
# NOTE(review): recent tibble/pillar releases document pillar.width and
# pillar.print_min -- confirm the dplyr.* names are still honoured by the
# installed versions
old_opts <- options(dplyr.width = Inf, dplyr.print_min = 6)

# reset options to whatever they were before (or just close R)
options(old_opts)

```

# Plot

```{r}

library(ggplot2)

# Per destination: number of flights, mean distance and mean arrival delay
# (missing values are ignored in both means), then plot delay against distance.
flights %>%
  group_by(dest) %>%
  summarize(
    count = n(),
    dist = mean(distance, na.rm = TRUE),
    delay = mean(arr_delay, na.rm = TRUE)
  ) %>%
  # Keep destinations with more than 20 flights and leave out HNL. The former
  # first argument `delay` was the numeric column itself, which filter()
  # rejects because every condition must be a logical vector.
  filter(count > 20, dest != "HNL") %>%
  ggplot(mapping = aes(x = dist, y = delay)) +
  geom_point(aes(size = count), alpha = 1 / 3) +
  geom_smooth(se = FALSE)

```

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值