20220603_R数据整理_toothgrowth下载-CSDN博客

本文链接：https://blog.csdn.net/weixin_43634298/article/details/125110979

本文介绍了R语言中使用dplyr包进行数据操作，包括新增变量、筛选行和列、分组计算等。同时展示了如何使用管道操作符 `%>%` 提高代码可读性，并通过tidyr包进行列的拆分与合并，以及宽长数据转换。此外，还演示了数据框的连接操作，包括左连接、右连接、全连接和内连接。

摘要由CSDN通过智能技术生成

# dplyr provides the data-manipulation verbs used below. If it is not
# installed yet, run install.packages("dplyr") once before loading it.
library(dplyr)

# Quick look at the built-in ToothGrowth data set.
head(ToothGrowth)
str(ToothGrowth)

# Add new variables and overwrite an existing one
# (try the same steps with your own data).
toothgrowth2 <- mutate(
  ToothGrowth,
  # Overwrite `len` with its square.
  len = len^2,
  # Row index; seq_len() stays correct even for a zero-row data frame,
  # where 1:nrow() would produce c(1, 0).
  nv = seq_len(nrow(ToothGrowth)),
  # Label rows whose index is above the median index "H", the rest "L".
  # `nv` can be used here because mutate() evaluates its arguments in order.
  nv2 = ifelse(nv > median(nv), "H", "L")
)

head(toothgrowth2)

# Filter rows (observations).
# Several conditions passed to filter() are combined with AND.
toothgrowth3 <- filter(
  toothgrowth2,
  # Keep the first 50 rows by their index.
  nv %in% 1:50,
  # Test the "H"/"L" label column nv2. The integer index nv can never equal
  # "H", so comparing nv here would always return zero rows.
  nv2 == "H"
)

toothgrowth3

# Select columns (variables) by position: the 2nd and 4th columns of
# toothgrowth3 (supp and nv, given the columns created earlier in this script).
toothgrowth4 <- select(
  toothgrowth3,
  2, 4
)

# Show the first six rows (head()'s default).
head(toothgrowth4, n = 6L)

# Grouped summaries.

# Whole-table maximum and minimum of len (no grouping).
summarise(
  ToothGrowth,
  len_max = max(len),
  len_min = min(len)
)

# Maximum len within each level of supp.
by_supp <- group_by(ToothGrowth, supp)
summarise(by_supp, len_max = max(len))

# Maximum len within each dose.
by_dose <- group_by(ToothGrowth, dose)
summarise(by_dose, len_max = max(len))

# Maximum len within each dose/supp combination.
by_dose_supp <- group_by(ToothGrowth, dose, supp)
summarise(by_dose_supp, len_max = max(len))

# Pipe operator %>% (RStudio shortcut: Ctrl+Shift+M).
# It passes the left-hand result as the first argument of the next call, so a
# sequence of steps reads top to bottom instead of as nested function calls.
# The package name is "magrittr"; dplyr also re-exports %>%.
library(magrittr)

ToothGrowth %>%
  # Add a row index (seq_len() is safe for zero-row input, unlike 1:nrow()).
  mutate(nv = seq_len(nrow(ToothGrowth))) %>%
  # Keep the first 50 rows.
  filter(nv %in% 1:50) %>%
  # Keep the first two columns (len and supp).
  select(1:2) %>%
  # Maximum len per supp level.
  group_by(supp) %>%
  summarise(len_max = max(len)) %>%
  # Convert the resulting tibble to a plain data.frame for printing.
  as.data.frame()

# Joining / merging data frames.
library(dplyr)

# Left table: integer column c1 and letter key c2 ("B" to "E").
df1 <- data.frame(
  c1 = 2:5,
  c2 = LETTERS[2:5]
)
df1

# Right table: letter key c3 and six distinct random integers drawn from
# 1..100 (no seed is set, so c4 differs between runs).
df2 <- data.frame(
  c3 = LETTERS[c(2:3, 20:23)],
  c4 = sample(1:100, size = 6)
)
df2

# Every join below matches df1$c2 against df2$c3.
key_map <- c("c2" = "c3")

# Left join: keep all rows of df1. Shown as a direct call and as the
# equivalent piped call.
left_join(df1, df2, by = key_map)
df1 %>% left_join(df2, by = key_map)

# Right join: keep all rows of df2.
df1 %>% right_join(df2, by = key_map)

# Full join: keep all rows of both tables.
df1 %>% full_join(df2, by = key_map)

# Inner join: keep only rows whose key appears in both tables.
df1 %>% inner_join(df2, by = key_map)

# Splitting and combining columns.
library(tidyr)

# Split: build a small table whose c5 and c6 values each hold two pieces
# joined by "-" and "." respectively.
df3 <- data.frame(
  c5 = paste0(letters[1:3], "-", 1:3),
  c6 = paste0(letters[1:3], ".", 1:3),
  c4 = rep("B", 3),
  c3 = c("H", "M", "L")
)
df3

df4 <- df3 %>%
  # Split c5 on "-" into c7/c8 and keep the original c5 column.
  separate(col = c5, into = c("c7", "c8"), sep = "-", remove = FALSE) %>%
  # Split c6 on a literal dot (escaped because sep is a regular expression)
  # into c9/c10 and drop c6.
  separate(col = c6, into = c("c9", "c10"), sep = "\\.", remove = TRUE)
df4

# Combine: paste pairs of columns back together.
df4 %>%
  # c11 = c7_c8, keeping c7 and c8.
  unite(col = "c11", c("c7", "c8"), sep = "_", remove = FALSE) %>%
  # c12 = c9.c10, dropping c9 and c10.
  unite(col = "c12", c("c9", "c10"), sep = ".", remove = TRUE) %>%
  # c13 = c4 and c3 concatenated with no separator, keeping both.
  unite(col = "c13", c("c4", "c3"), sep = "", remove = FALSE)

# Reshaping between wide and long layouts.
library(tidyr)

# Wide -> long.
set.seed(42) # any integer works; it only makes the random columns repeatable
df5 <- data.frame(
  time = rep(2011:2013, each = 3),
  area = rep(letters[1:3], times = 3),
  pop = sample(100:1000, 9),
  den = round(rnorm(9, mean = 3, sd = 0.1), 2),
  mj = sample(8:12, 9, replace = TRUE)
)
df5

# Stack the three measurement columns (everything except time and area)
# into name/value pairs: one row per time, area and variable.
df6 <- df5 %>%
  pivot_longer(
    cols = c(pop, den, mj),
    names_to = "varb",
    values_to = "value"
  )
df6

# Long -> wide. This is not a transpose: each area/varb combination becomes
# its own column (named like a_pop), leaving one row per time.
df6 %>%
  pivot_wider(
    names_from = c(area, varb),
    values_from = value
  )