生信小白菜之关于arrange、select函数的一切

羊&鹿

于 2024-03-31 15:50:57 发布

阅读量539

点赞数 10

文章标签：数据库 r语言笔记数据分析

本文链接：https://blog.csdn.net/yangyulu1998/article/details/137201994

版权

arrange() 基本用法

# arrange() is used almost exactly like filter(), but instead of picking
# rows it changes the order of the rows.
# The first argument is the data frame; the second and later arguments are
# the column names to sort by.
## Each later column sorts within the order set by the columns before it.
## For example:
arrange(flights, year, month, day)
# The default order is ascending, i.e. smallest first.
# Wrap a column in desc() to sort it in descending order instead.
arrange(flights, desc(day)) # desc() takes exactly one argument

arrange排序时缺失值永远排在最后

# This holds whether the sort is ascending or descending.
arrange(flights, arr_time)
tail(arrange(flights, arr_time)) # the last few rows
arrange(flights, desc(arr_time))
tail(arrange(flights, desc(arr_time))) # same outcome: the rows shown all have NA arr_time

# How can the missing values be moved to the front instead?
# Sort on the logical !is.na(arr_time): it is FALSE for the NA rows, and
# FALSE sorts before TRUE.
arrange(flights, !is.na(arr_time))

arrange练习题

# Flights with the longest delay (largest arrival delay first).
arrange(flights, desc(arr_delay))
# Flights with the earliest departure time.
arrange(flights, dep_time)
# Fastest flights: speed is the distance covered per unit of air time, so
# sort by distance / air_time in descending order. Sorting by air_time
# alone only finds the shortest flights, not the fastest ones.
arrange(flights, desc(distance / air_time))
# Flights that spent the longest time in the air.
arrange(flights, desc(air_time))

select()函数基本用法

# select() picks columns, working from the variable names.

# It follows the same pattern as the two verbs covered before it:
# - the first argument is a data frame;
# - the remaining arguments are variable names, written without quotes;
# - the result is a new data frame.

# For example:
## every column from year through day
select(flights, year:day)
## every column except those from year through day
select(flights, -(year:day))

select()配套一些辅助函数

# Helper functions meant to be used together with select().
starts_with("abc") # names that begin with "abc"
ends_with("xyz") # names that end with "xyz"
contains("ijk") # names that contain "ijk"
matches("(.)\\1") # names that match a regular expression (here: a repeated character)
num_range("x", 1:3) # matches x1, x2 and x3

不用select()而用rename()重命名变量

# select() can rename variables too, but that is rarely useful because it
# drops every variable that is not explicitly mentioned.
# rename(), a variant of select(), renames variables and keeps all the
# variables that are not mentioned.
# rename() returns a new data frame and leaves `flights` itself untouched,
# so the result has to be stored before the new name can be inspected.
flights_renamed <- rename(flights, tail_num = tailnum)
View(flights_renamed) # check the renamed column

结合select()和everything()移动变量

# Use everything() when a few variables should move to the front of the
# data frame: list them first, then let everything() bring in the rest.
select(flights, time_hour, air_time, everything())

select()练习题

# Pick out dep_time, dep_delay, arr_time and arr_delay.
## 1: name each column
select(flights, dep_time, dep_delay, arr_time, arr_delay)
## 2: match on the name prefixes
select(flights, starts_with("dep"), starts_with("arr"))
## 3: move the four columns to the front, then keep only the first four
select(flights, dep_time, dep_delay, arr_time, arr_delay, everything())[, 1:4]
## 4: column positions
select(flights, 4, 6, 7, 9)
## 5: column positions passed as a single vector
select(flights, c(4, 6, 7, 9))
## 6 (placeholder left by the author for further variants)
...

# What happens when the same variable name is given to select() more than once?
select(flights, dep_time, dep_time) # the repeated variable is ignored automatically

# What does one_of() do?
# It selects the columns whose names are listed in a character vector.
# NOTE: the tidyselect documentation marks one_of() as superseded by
# any_of() and all_of().
vars <- c("year", "day", "dep_delay")
select(flights, one_of(vars))
select(flights, all_of(vars))

# What does the following code return?
select(flights, contains("TIME")) # every variable whose name contains "time" is selected
## How do the helper functions treat upper/lower case by default?
## The signatures are listed below; the trailing notes describe the arguments.
starts_with(match, ignore.case = TRUE, vars = NULL) # match: the text to look for; for matches() it can be a regular expression or a stringr pattern
ends_with(match, ignore.case = TRUE, vars = NULL) # ignore.case: TRUE by default, so upper/lower case is ignored when matching names
contains(match, ignore.case = TRUE, vars = NULL) # vars: character vector of variable names; when not supplied, the names are taken from the current selection context set up by functions such as select() or pivot_longer()
matches(match, ignore.case = TRUE, perl = FALSE, vars = NULL)
num_range(prefix, range, suffix = "", width = NULL, vars = NULL)

羊&鹿

关注

10
点赞
踩
0

收藏

觉得还不错? 一键收藏
1
评论
生信小白菜之关于arrange、select函数的一切

arrange() 排序时，无论升序还是降序，缺失值（NA）都排在最后，可用 tail(arrange(flights, arr_time)) 查看末尾几行进行验证；若想让缺失值排到最前面，可使用 arrange(flights, !is.na(arr_time))。
复制链接

扫一扫