R语言学习_tidyverse_dplyr常用命令大全及示例

wjgyy808

已于 2023-10-18 08:39:34 修改

阅读量240

点赞数

文章标签： r语言学习

于 2023-10-04 18:13:34 首次发布

本文链接：https://blog.csdn.net/wjgyy808/article/details/133557821

版权

本文用到的data: gapminder，R中有对应的包，可以install、library。

命令概览

数据基础处理

glimpse

glimpse()对dataframe基本呈现，包括变量名、变量的前几个结果。

select

选择想要的变量。

gapminder %>%
	select(year, country)
#选择包含work的变量
gapminder %>%
	select(year, country,coutains("work"))
#选择不包含year的所有其他变量
gapminder %>%
	select(-year)

其他select中用到的命令
contains()
starts_with()
ends_with()
last_col()
matches()

filter

filter示意图

#筛选某年份某国家的数据
library(gapminder)
library(dplyr)

gapminder %>%
	filter(year == 2002) %>%
		filter(country == "China")

arrange

arrange示意图

#ascending
gapminder %>% 
	arrange(gdpPrecap)
	
#descending
gapminder %>% 
	arrange(desc(gdpPrecap))

mutate

mutate示意图

改变一个变量 change a variable

变量名不变，即改变了原变量名下的内容。

gapminder %>% 
    mutate(pop = pop/10000)

增加一个变量 add a new variable

变量名是新的

gapminder %>% 
    mutate(gdp = gdpPercap * pop)

合用

gapminder %>% 
	mutate(gdp = gdpPercap * pop) %>%
    	filter(year == 2007) %>%
        	arrange(desc(gdp))

变量rename

# 通过mutate
gapminder %>% 
	mutate(gdpPc = gdpPercap)

# 通过select的时候直接rename
gapminder %>% 
	select(country, gdpPc = gdpPercap)

# rename
gapminder %>% 
	rename(gdpPc = gdpPercap)

transmute

select()与mutate()的联合。
选择想要的变量，并且计算新的变量。

counties %>%
  # Keep the state, county, and populations columns, and add a density column
	  transmute(state, county, population, density = population/land_area) %>%
  # Filter for counties with a population greater than one million 
	  filter(population > 1000000) %>%
  # Sort density in ascending order 
	  arrange(density)

几个变量处理命令的比较

数据处理的命令分析
select和transmute都是从数据库中选择想要的。而rename和mutate都不会更改数据库中原来的变量。
select和rename都不能更改变量的具体value。而transmute和mutate通过计算能够更改变量的具体value。

# Change the name of the unemployment column
counties %>%
  rename(unemployment_rate = unemployment)

# Keep the state and county columns, and the columns containing poverty
counties %>%
  select(state, county, contains("poverty"))

# Calculate the fraction_women column without dropping the other columns
counties %>%
  mutate(fraction_women = women / population)

# Keep only the state, county, and employment_rate columns
counties %>%
  transmute(state, county, employment_rate = employed / population)

数据基本探讨分析

count

汇总数据的一种方法是对其进行计数：找出观测值的数量。这方面的dplyr动词是count（）
count可以排序，或者设置权重变量。

#每个国家的pop（排序sort）,设置变量权重(wt)
gapminder %>% 
	count(country, wt = pop, sort=T, wt=n)

summarize

summarize图片说明
可以应用各种计算，包括sum，mean，median，max，min，n 等。
默认情况下是汇总计算总的结果

gapminder %>% 
	summarize(meanlifeExp= mean(lifeExp))

输出结果是所有country所有year的平均值，仅一个值。

分类计算

gapminder %>% 
	filter( year= 2007) %>%
		summarize(meanlifeExp= mean(lifeExp))

meanlifeExp
67.007

计算不同变量

lifeExp以及对应的pop同时计算

gapminder %>% 
	filter( year= 2007) %>%
		summarize(meanlifeExp= mean(lifeExp),  totalPop = sum(pop))

计算结果

meanlifeExp totalPop
67.007 6251013179

group_by

group_by示意图

以group的形式呈现结果。默认升序。

gapminder %>% 
	group_by(year，continent) %>%
		summarize(meanlifeExp= mean(lifeExp),  totalPop = sum(pop))

多命令混合运用尝试

counties_selected %>%
  group_by(state) %>%
  summarize(total_area = sum(land_area), total_population = sum(population)) %>%    #按照state计算面积和人口
# Add a density column
   mutate(density = total_population/total_area) %>%
# Sort by density in descending order
  arrange(desc(density))

注意事项

counties_selected %>%
#Group and summarize to find the total population
  group_by(region, state) %>%
  summarize(total_pop = sum(population)) %>%
#Calculate the average_pop and median_pop columns 
  ungroup(state) %>%
#此时前面步骤的summarise已经得到一个titble，按照新tible的变量计算，用total_pop，而不是population
  summarise(average_pop = mean(total_pop),    
            median_pop = median(total_pop))

slice_max slice_min

提取max或者min的情况，可以通过n设置提取的数量。

#每个region一个排名最高的walk的比例
counties_selected %>%
  group_by(region) %>%
  slice_max(walk, n=1)

数据合并

inner_join

#将setB与setA合并，识别变量是idA和idB，相同变量名时，加相应的AB后缀。
setA <- setA %>% inner_join(setB, by = c('idA' = 'idB'), suffix = c('_A', '_B'))

合并三个及以上的df
合并变量示例

sets %>%
  inner_join(inventories, by = "set_num") %>%
  inner_join(inventory_parts, by = c("id" = "inventory_id")) %>%
  inner_join(colors, by = c('color_id' = 'id'), suffix = c('_set', '_color'))

left_join

保留前者的所有观测
left_join与inner_join

right_join

保留后者的所有观测。
left_join与right_join

full_join

两个数据库的观测均保留。
full_join示例

数据库与自己合并

themes数据库情况。

有id列和parent_id列。考虑到这一点，可以将主题表连接到自身，以确定不同主题存在的父子关系。

themes %>% 
  # Inner join the themes table
  inner_join(themes, by = c('id'='parent_id'), suffix = c('_parent','_child')) %>%
  # Filter for the "Harry Potter" parent name 
  filter(name_parent=='Harry Potter')

合并后的数据库
再次内部联接到自身，从而确定child与grandchild的关系。

# Join themes to itself again to find the grandchild relationships
themes %>% 
  inner_join(themes, by = c("id" = "parent_id"), suffix = c("_parent", "_child")) %>%
  inner_join(themes, by = c('id_child' = 'parent_id'), suffix = c("_parent", "_grandchild"))

其他

replace_na

parts %>%
  right_join(part_categories, by = c("part_cat_id" = "id")) %>%
#用replace_na将缺失值替换掉
  replace_na(list(var1=0,
  							var2=0))