dplyr入门命令
本文用到的data: gapminder,R中有对应的包,可以install、library。
数据基础处理
glimpse
glimpse()对dataframe基本呈现,包括变量名、变量的前几个结果。
select
选择想要的变量。
gapminder %>%
select(year, country)
#选择包含work的变量
gapminder %>%
select(year, country,coutains("work"))
#选择不包含year的所有其他变量
gapminder %>%
select(-year)
其他select中用到的命令
contains()
starts_with()
ends_with()
last_col()
matches()
filter
#筛选某年份某国家的数据
library(gapminder)
library(dplyr)
gapminder %>%
filter(year == 2002) %>%
filter(country == "China")
arrange
#ascending
gapminder %>%
arrange(gdpPrecap)
#descending
gapminder %>%
arrange(desc(gdpPrecap))
mutate
改变一个变量 change a variable
变量名不变,即改变了原变量名下的内容。
gapminder %>%
mutate(pop = pop/10000)
增加一个变量 add a new variable
变量名是新的
gapminder %>%
mutate(gdp = gdpPercap * pop)
合用
gapminder %>%
mutate(gdp = gdpPercap * pop) %>%
filter(year == 2007) %>%
arrange(desc(gdp))
变量rename
# 通过mutate
gapminder %>%
mutate(gdpPc = gdpPercap)
# 通过select的时候直接rename
gapminder %>%
select(country, gdpPc = gdpPercap)
# rename
gapminder %>%
rename(gdpPc = gdpPercap)
transmute
select()与mutate()的联合。
选择想要的变量,并且计算新的变量。
counties %>%
# Keep the state, county, and populations columns, and add a density column
transmute(state, county, population, density = population/land_area) %>%
# Filter for counties with a population greater than one million
filter(population > 1000000) %>%
# Sort density in ascending order
arrange(density)
几个变量处理命令的比较
select和transmute都是从数据库中选择想要的。而rename和mutate都不会更改数据库中原来的变量。
select和rename都不能更改变量的具体value。而transmute和mutate通过计算能够更改变量的具体value。
# Change the name of the unemployment column
counties %>%
rename(unemployment_rate = unemployment)
# Keep the state and county columns, and the columns containing poverty
counties %>%
select(state, county, contains("poverty"))
# Calculate the fraction_women column without dropping the other columns
counties %>%
mutate(fraction_women = women / population)
# Keep only the state, county, and employment_rate columns
counties %>%
transmute(state, county, employment_rate = employed / population)
数据基本探讨分析
count
汇总数据的一种方法是对其进行计数:找出观测值的数量。这方面的dplyr动词是count()
count可以排序,或者设置权重变量。
#每个国家的pop(排序sort),设置变量权重(wt)
gapminder %>%
count(country, wt = pop, sort=T, wt=n)
summarize
可以应用各种计算,包括sum,mean,median,max,min,n 等。
默认情况下是汇总计算总的结果
gapminder %>%
summarize(meanlifeExp= mean(lifeExp))
输出结果是所有country所有year的平均值,仅一个值。
分类计算
gapminder %>%
filter( year= 2007) %>%
summarize(meanlifeExp= mean(lifeExp))
meanlifeExp
67.007
计算不同变量
lifeExp以及对应的pop同时计算
gapminder %>%
filter( year= 2007) %>%
summarize(meanlifeExp= mean(lifeExp), totalPop = sum(pop))
计算结果
meanlifeExp totalPop
67.007 6251013179
group_by
以group的形式呈现结果。默认升序。
gapminder %>%
group_by(year,continent) %>%
summarize(meanlifeExp= mean(lifeExp), totalPop = sum(pop))
多命令混合运用尝试
counties_selected %>%
group_by(state) %>%
summarize(total_area = sum(land_area), total_population = sum(population)) %>% #按照state计算面积和人口
# Add a density column
mutate(density = total_population/total_area) %>%
# Sort by density in descending order
arrange(desc(density))
注意事项
counties_selected %>%
#Group and summarize to find the total population
group_by(region, state) %>%
summarize(total_pop = sum(population)) %>%
#Calculate the average_pop and median_pop columns
ungroup(state) %>%
#此时前面步骤的summarise已经得到一个titble,按照新tible的变量计算,用total_pop,而不是population
summarise(average_pop = mean(total_pop),
median_pop = median(total_pop))
slice_max slice_min
提取max或者min的情况,可以通过n设置提取的数量。
#每个region一个排名最高的walk的比例
counties_selected %>%
group_by(region) %>%
slice_max(walk, n=1)
数据合并
inner_join
#将setB与setA合并,识别变量是idA和idB,相同变量名时,加相应的AB后缀。
setA <- setA %>% inner_join(setB, by = c('idA' = 'idB'), suffix = c('_A', '_B'))
合并三个及以上的df
sets %>%
inner_join(inventories, by = "set_num") %>%
inner_join(inventory_parts, by = c("id" = "inventory_id")) %>%
inner_join(colors, by = c('color_id' = 'id'), suffix = c('_set', '_color'))
left_join
保留前者的所有观测
right_join
保留后者的所有观测。
full_join
两个数据库的观测均保留。
数据库与自己合并
themes数据库情况。
有id列和parent_id列。考虑到这一点,可以将主题表连接到自身,以确定不同主题存在的父子关系。
themes %>%
# Inner join the themes table
inner_join(themes, by = c('id'='parent_id'), suffix = c('_parent','_child')) %>%
# Filter for the "Harry Potter" parent name
filter(name_parent=='Harry Potter')
再次内部联接到自身,从而确定child与grandchild的关系。
# Join themes to itself again to find the grandchild relationships
themes %>%
inner_join(themes, by = c("id" = "parent_id"), suffix = c("_parent", "_child")) %>%
inner_join(themes, by = c('id_child' = 'parent_id'), suffix = c("_parent", "_grandchild"))
其他
replace_na
parts %>%
right_join(part_categories, by = c("part_cat_id" = "id")) %>%
#用replace_na将缺失值替换掉
replace_na(list(var1=0,
var2=0))