dplyr包功能（数据清理、过滤、合并R实现）

最新推荐文章于 2024-09-21 10:47:48 发布

一个人旅行*-*

最新推荐文章于 2024-09-21 10:47:48 发布

阅读量2.8k

点赞数

分类专栏： R语言文章标签： r语言数据挖掘 dplyr 数据清理

原文链接：https://www.r-bloggers.com/2017/07/useful-dplyr-functions-wexamples/

版权

R语言专栏收录该内容

116 篇文章 217 订阅

订阅专栏

R软件包dplyr用于数据清理，处理，可视化和分析，包含了很多有用的功能，与ggplot2,reshape2并列为数据分析及可视化的三大包之一。

select() 
filter()
mutate() 
group_by() 
summarise()
arrange() 
join()

示例数据

# Attach dplyr. library() stops with an error when the package is missing,
# whereas require() only returns FALSE and lets the script fail later.
library(dplyr)

# Data file
file <- "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"

# Some sensible variable names
df_names <- c("age", "wrkclass", "fnlweight", "education_lvl", "edu_score",
              "marital_status", "occupation", "relationship", "ethnic", "gender",
              "cap_gain", "cap_loss", "hrs_wk", "nationality", "income")

# Import the data. header = FALSE because the column names are supplied
# explicitly through col.names.
# NOTE(review): the NA markers (and the string comparisons used later in this
# script) start with a space, presumably because each field keeps the blank
# that follows the comma separator -- confirm against the raw file.
df <- read.csv(file,
               header = FALSE,
               sep = ",",
               na.strings = c(" ?", " ", ""),
               row.names = NULL,
               col.names = df_names)

dplyr中的许多数据操作任务都可以借助前向管道运算符（%>%）完成。该运算符最初由magrittr软件包引入，此后dplyr软件包也将其一并导出。它能让数据操作写得连贯流畅，是一个非常有用的工具，可产生可读性很高的代码。本文使用的普查数据集需要经过一些预处理才能用于分类算法；这篇文章不涉及这类预处理，也不包括预测建模。

去除重复项

# Count the rows that remain after dropping fully duplicated rows
df %>%
  distinct() %>%
  nrow()

# Drop duplicated rows and keep the result in a new data frame
df_clean <- df %>%
  distinct()

# De-duplicate on one or more variables. .keep_all = TRUE retains the other
# columns too. TRUE is spelled out because T is an ordinary variable that
# can be reassigned.
df %>%
  distinct(gender, .keep_all = TRUE)
df %>%
  distinct(gender, education_lvl, .keep_all = TRUE)

选取随机样本

# Sample a fixed number of rows, without or with replacement.
# floor() makes the 70% sample size an explicit whole number.
sample_n(df, size = floor(nrow(df) * 0.7), replace = FALSE)
sample_n(df, size = 20, replace = TRUE)

# Sample a fraction of the rows, without or with replacement
sample_frac(df, size = 0.7, replace = FALSE)
sample_frac(df, size = 0.8, replace = TRUE)

变量重命名

# Rename a single variable (new name on the left) and keep the result;
# the examples further down refer to this column as INCOME.
df <- df %>%
  rename("INCOME" = "income")

# Several variables can be renamed in one call. The result is only printed,
# not assigned: `income` no longer exists after the step above, and the
# examples further down still use the `age` column under its original name.
df %>%
  rename("AGE" = "age", "HRS_WK" = "hrs_wk")

select()函数

# Keep only the named columns (INCOME is the name given by the earlier rename)
df %>%
  select(education_lvl, INCOME)

# pull(), available since dplyr 0.7.0, extracts one variable as a vector
df %>%
  pull(age)

# A minus sign drops columns; they can be referenced by name ...
df %>%
  select(-edu_score)

# ... or by position
df %>%
  select(-c(1, 4))

df %>%
  select(-(2:6))

有些功能可与select函数结合，下面为一些示例

# Columns whose name starts with "e"
df %>%
  select(starts_with("e"))

# The minus sign works with helpers as well: everything except those columns
df %>%
  select(-starts_with("e"))

# Columns whose name contains a given string
df %>%
  select(contains("edu"))

# Reorder columns: INCOME first, followed by all remaining columns
df %>%
  select(INCOME, everything())

# Columns whose name ends with a given string
df %>%
  select(ends_with("e"))

df %>%
  select(ends_with("_loss"))

filter()函数

# Keep the observations with age greater than 30
df %>%
  filter(age > 30)

# Match against several values with %in% (the strings must match exactly)
df %>%
  filter(relationship %in% c(" Unmarried", " Wife"))

# Conditions can be combined with the OR operator (|)
df %>%
  filter(relationship == " Husband" | relationship == " Wife")

# ... or with the AND operator (&)
df %>%
  filter(age > 30 & INCOME == " >50K")

# Both kinds of condition combined
df %>%
  filter(age > 30 & education_lvl %in% c(" Doctorate", " Masters"))

# Negation
df %>%
  filter(education_lvl != " Doctorate")

# grepl() can be used inside filter() for pattern matching
df %>%
  filter(grepl(pattern = " Wi", x = relationship))

summarise()函数

动态汇总数据组甚至管道组，以进行ggplot数据可视化。

# Summary statistics of age for the rows whose INCOME is " >50K"
df %>%
  filter(INCOME == " >50K") %>%
  summarise(mean_age = mean(age),
            median_age = median(age),
            sd_age = sd(age))

# Summarise several variables at once with across().
# funs() is deprecated and summarise_at()/_if()/_all() are superseded; a named
# list of functions inside across() replaces them (requires dplyr >= 1.0.0).
# Output columns are named <variable>_<list name>, e.g. age_mean.
df %>%
  filter(INCOME == " >50K") %>%
  summarise(across(c(age, hrs_wk),
                   list(n = ~ n(),
                        mean = mean,
                        median = median)))

# Inside a formula, `.` stands for the column currently being summarised
df %>%
  summarise(across(c(age, hrs_wk),
                   list(n = ~ n(),
                        missing = ~ sum(is.na(.)),
                        mean = ~ mean(., na.rm = TRUE))))

# An anonymous function can define a custom summary statistic
df %>%
  summarise(across(age,
                   function(x) sum((x - mean(x)) / sd(x))))

# Summarise conditionally: where() selects every numeric column
df %>%
  filter(INCOME == " >50K") %>%
  summarise(across(where(is.numeric),
                   list(n = ~ n(),
                        mean = mean,
                        median = median)))

# Subset the numeric columns, then summarise all of them.
# vapply() is used instead of sapply() so the result is always a logical
# vector, whatever the input looks like.
ints <- df[vapply(df, is.numeric, logical(1))]
summarise(ints,
          across(everything(),
                 list(mean = mean,
                      median = median,
                      sd = sd,
                      var = var)))

arrange()函数

升序或降序排列（默认升序）

# Sort by age, ascending (the default), and show the first 10 rows
df %>%
  arrange(age) %>%
  head(n = 10)

# Sort by age, descending, and show the first 10 rows
df %>%
  arrange(desc(age)) %>%
  head(n = 10)

group_by() 函数

# Grouped summaries for routine data analysis

# Mean age per gender
df %>%
  group_by(gender) %>%
  summarise(Mean = mean(age))

# Number of rows per relationship category
df %>%
  group_by(relationship) %>%
  summarise(total = n())

# Row count and mean age per relationship category
df %>%
  group_by(relationship) %>%
  summarise(
    total = n(),
    mean_age = mean(age)
  )

mutate()函数

mutate（）用于从现有的局部变量或全局对象创建新变量。也可以在mutate（）中指定新变量，例如序列。

# Build a new variable from an existing one (age centred and scaled by its sd)
df %>%
  mutate(norm_age = (age - mean(age)) / sd(age))

# Multiply every numeric column by 1000. Each result is stored in a new column
# named <original>_new, so the original columns are kept.
# across() + where() replaces mutate_if() with the deprecated funs()
# (requires dplyr >= 1.0.0); `.` is the column currently being transformed.
df %>%
  mutate(across(where(is.numeric),
                list(new = ~ . * 1000))) %>%
  head()

join()函数

join 系列函数用于根据共同的 ID 或其他共有变量，合并来自不同表的行。dplyr 中并没有名为 join() 的单个函数，而是提供多种变体，常用的有 left_join()、right_join()、inner_join() 和 full_join()。

# Add a sequential ID to act as the join key and move it to the front.
# seq_len() stays correct for a zero-row data frame, unlike 1:nrow(df).
df <- df %>%
  mutate(ID = seq_len(nrow(df))) %>%
  select(ID, everything())

# Build two partially overlapping tables (IDs 26-50 appear in both)
table_1 <- df[1:50, ] %>%
  select(ID, age, education_lvl)

table_2 <- df[26:75, ] %>%
  select(ID, gender, INCOME)

# Left join: keep every row of table_1 and add the matching columns of table_2
# (the direction is implied by the argument order)
left_join(table_1, table_2, by = "ID")

# Right join: keep every row of table_2 and add the matching columns of table_1
right_join(table_1, table_2, by = "ID")

# Inner join: keep only the IDs present in both tables
inner_join(table_1, table_2, by = "ID")

# Full join: keep all rows from both tables
full_join(table_1, table_2, by = "ID")

以上总结了dplyr的一些出色功能。有关函数及其参数的更多信息，请在 R 控制台中使用 `?函数名` 的形式（例如 `?mutate`）查看帮助文档。

References

Hadley Wickham, Romain Francois, Lionel Henry and Kirill Müller (2017). dplyr: A
Grammar of Data Manipulation. R package version 0.7.0.
https://CRAN.R-project.org/package=dplyr

H. Wickham. ggplot2: Elegant Graphics for Data Analysis. Springer-Verlag New
York, 2009.