# Month abbreviations stored as a plain character vector.
months <- c("Dec", "Apr", "Jan", "Mar")

# Sorting a character vector is alphabetical, not calendar order.
sort(months)
## [1] "Apr" "Dec" "Jan" "Mar"


## Define the desired level order first, then make the variable follow it.
month_levels <- c(
  "Jan", "Feb", "Mar", "Apr", "May", "Jun",
  "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
)

months1 <- factor(months, levels = month_levels)
months1
## [1] Dec Apr Jan Mar
## Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec


# A value that is not among the declared levels (here "Dee") becomes NA.
x1 <- c("Apr", "Mar", "Jan", "Dee")
factor(x1, levels = month_levels)
## [1] Apr  Mar  Jan  <NA>
## Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec


# Use the order of first appearance as the level order.
factor(months, levels = unique(months))
## [1] Dec Apr Jan Mar
## Levels: Dec Apr Jan Mar

# Build the factor, then order its levels by first appearance.
fct_inorder(factor(months))
## [1] Dec Apr Jan Mar
## Levels: Dec Apr Jan Mar



1. 修改因子向量顺序


  • fct_relevel():手动调整顺序
  • fct_inorder()/fct_infreq()/fct_inseq():根据第一次出现的顺序、出现的频率多少、数字顺序进行排序
  • fct_reorder()/fct_reorder2()/last2()/first2():根据另外一个变量的值调整顺序
  • fct_shuffle():随机重新排列
  • fct_rev():反转因子水平
  • fct_shift():将因子向左或右移动

2. 修改因子向量名称


  • fct_anon():匿名化因子水平(用随机顺序的数字编号替换原有的水平名称)
  • fct_collapse():将因子水平折叠成手动定义的组
  • fct_lump()/fct_lump_min()/fct_lump_prop()/fct_lump_n()/fct_lump_lowfreq():将出现次数较少的合并为“其他”
  • fct_other():将指定的因子水平设置为“其他”
  • fct_recode():手动改变因子的值
  • fct_relabel():自动重新标记因子水平,必要时折叠

3. 增加/删除因子

  • fct_expand():增加新的因子水平
  • fct_explicit_na():使缺失值显式显示
  • fct_drop():删除没有用到的因子水平
  • fct_unify():让列表中的多个因子拥有相同的水平

4. 合并多个因子

  • fct_c():拼接多个因子,并合并它们的水平
  • fct_cross():把多个因子的水平两两组合,生成新的因子


5. 其他辅助函数

  • as_factor():转换为因子,水平按第一次出现的顺序排列
  • fct_count():统计每个水平出现的次数
  • fct_match():判断每个值是否属于指定的水平
  • fct_unique():按水平顺序返回因子的唯一值
  • lvls_reorder()/lvls_revalue()/lvls_expand():底层函数,分别用于重排、重命名、扩充因子水平
  • lvls_union():取多个因子水平的并集

6. 一个数据集

  • gss_cat


1.1 fct_relevel()

## Create a factor with an explicit level order.
f <- factor(c("a", "b", "c", "d"), levels = c("b", "c", "d", "a"))
f
## [1] a b c d
## Levels: b c d a
## Move "c" and "d" to the 1st and 2nd positions.
fct_relevel(f, c("c", "d"))
## [1] a b c d
## Levels: c d b a
## Make "a" the 3rd level (insert it after position 2).
fct_relevel(f, "a", after = 2)
## [1] a b c d
## Levels: b c a d
# Move "a" to the very end.
fct_relevel(f, "a", after = Inf)
## [1] a b c d
## Levels: b c d a
## Reorder the levels by passing a function.
fct_relevel(f, sort)
## [1] a b c d
## Levels: a b c d
## Note: the resulting order is that of `sort(c("a", "b", "c", "d"))`,
## not that of `sort(f)`.
## Random order.
fct_relevel(f, sample)
## [1] a b c d
## Levels: a b c d

## Reverse the level order.
fct_relevel(f, rev)
## [1] a b c d
## Levels: a d c b

下面是一个看起来很复杂,其实不复杂的例子,使用的是内置数据:gss_cat,只选择其中的2列,我们的目标是把每一列中的Don't know放到最后。

## Look at the original factor levels first.
df  <- forcats::gss_cat[, c("rincome", "denom")]
lapply(df, levels) # apply `levels()` to every column of df
## $rincome
##  [1] "No answer"      "Don't know"     "Refused"        "$25000 or more"
##  [5] "$20000 - 24999" "$15000 - 19999" "$10000 - 14999" "$8000 to 9999" 
##  [9] "$7000 to 7999"  "$6000 to 6999"  "$5000 to 5999"  "$4000 to 4999" 
## [13] "$3000 to 3999"  "$1000 to 2999"  "Lt $1000"       "Not applicable"
## $denom
##  [1] "No answer"            "Don't know"           "No denomination"     
##  [4] "Other"                "Episcopal"            "Presbyterian-dk wh"  
##  [7] "Presbyterian, merged" "Other presbyterian"   "United pres ch in us"
## [10] "Presbyterian c in us" "Lutheran-dk which"    "Evangelical luth"    
## [13] "Other lutheran"       "Wi evan luth synod"   "Lutheran-mo synod"   
## [16] "Luth ch in america"   "Am lutheran"          "Methodist-dk which"  
## [19] "Other methodist"      "United methodist"     "Afr meth ep zion"    
## [22] "Afr meth episcopal"   "Baptist-dk which"     "Other baptists"      
## [25] "Southern baptist"     "Nat bapt conv usa"    "Nat bapt conv of am" 
## [28] "Am bapt ch in usa"    "Am baptist asso"      "Not applicable"

可以看到每一列都有一个Don't know,我们要把它放到最后,顺便学习lapply的用法。

# Apply `fct_relevel(..., "Don't know", after = Inf)` to every column of df.
df2 <- lapply(df, fct_relevel, "Don't know", after = Inf) 

lapply(df2, levels) # "Don't know" is now the last level in each column
## $rincome
##  [1] "No answer"      "Refused"        "$25000 or more" "$20000 - 24999"
##  [5] "$15000 - 19999" "$10000 - 14999" "$8000 to 9999"  "$7000 to 7999" 
##  [9] "$6000 to 6999"  "$5000 to 5999"  "$4000 to 4999"  "$3000 to 3999" 
## [13] "$1000 to 2999"  "Lt $1000"       "Not applicable" "Don't know"    
## $denom
##  [1] "No answer"            "No denomination"      "Other"               
##  [4] "Episcopal"            "Presbyterian-dk wh"   "Presbyterian, merged"
##  [7] "Other presbyterian"   "United pres ch in us" "Presbyterian c in us"
## [10] "Lutheran-dk which"    "Evangelical luth"     "Other lutheran"      
## [13] "Wi evan luth synod"   "Lutheran-mo synod"    "Luth ch in america"  
## [16] "Am lutheran"          "Methodist-dk which"   "Other methodist"     
## [19] "United methodist"     "Afr meth ep zion"     "Afr meth episcopal"  
## [22] "Baptist-dk which"     "Other baptists"       "Southern baptist"    
## [25] "Nat bapt conv usa"    "Nat bapt conv of am"  "Am bapt ch in usa"   
## [28] "Am baptist asso"      "Not applicable"       "Don't know"


# Asking for a level that `f` does not have only produces a warning;
# the levels are left as they were.
fct_relevel(f, "e")
## Warning: Unknown levels in `f`: e
## [1] a b c d
## Levels: b c d a

1.2 fct_inorder()/fct_infreq()/fct_inseq()


  • fct_inorder(): 按照第一次出现的顺序

  • fct_infreq(): 按照每个水平出现的频率(从大到小)

  • fct_inseq(): 按照数字大小

f <- factor(c("b", "b", "a", "c", "c", "c"))
f # alphabetical level order by default
## [1] b b a c c c
## Levels: a b c

fct_inorder(f) # order of first appearance
## [1] b b a c c c
## Levels: b a c

fct_infreq(f) # most frequent level first
## [1] b b a c c c
## Levels: c b a

# Number-like levels declared in the order "3", "2", "1".
f <- factor(1:3, levels = c("3", "2", "1"))
f
## [1] 1 2 3
## Levels: 3 2 1

fct_inseq(f) # numeric order, even though the declared order was "3", "2", "1"
## [1] 1 2 3
## Levels: 1 2 3




# Bar chart of hair colour with the levels in their default order.
# The final layer was missing, leaving a dangling `+` (a syntax error).
ggplot(starwars, aes(x = hair_color)) +
  geom_bar() +
  coord_flip()

plot of chunk unnamed-chunk-16


# Same bar chart with the levels ordered by frequency via fct_infreq().
# The final layer was missing, leaving a dangling `+` (a syntax error).
ggplot(starwars, aes(x = fct_infreq(hair_color))) +
  geom_bar() +
  coord_flip()

plot of chunk unnamed-chunk-17


1.3 fct_reorder()/fct_reorder2()/last2()/first2()


## Build a small tibble.
df <- tibble::tribble(
  ~color,     ~a, ~b,
  "blue",      1,  2,
  "green",     6,  2,
  "purple",    3,  3,
  "red",       2,  3,
  "yellow",    5,  1
)

## Convert `color` to a factor and look at its level order.
df$color <- factor(df$color)
df$color
## [1] blue   green  purple red    yellow
## Levels: blue green purple red yellow


# Reorder the levels of `color` by the minimum of `a` within each level.
fct_reorder(df$color, df$a, min)
## [1] blue   green  purple red    yellow
## Levels: blue red purple yellow green


# Sepal width per species, with the species in their existing level order.
boxplot(Sepal.Width ~ Species, data = iris)

plot of chunk unnamed-chunk-20

# Same plot with the species levels reordered by Sepal.Width.
boxplot(Sepal.Width ~ fct_reorder(Species, Sepal.Width), data = iris)

plot of chunk unnamed-chunk-20

# As above, but with `.desc = TRUE` so the ordering is descending.
boxplot(Sepal.Width ~ fct_reorder(Species, Sepal.Width, .desc = TRUE), data = iris)

plot of chunk unnamed-chunk-20

# Reorder the levels of `color` using two variables, `a` and `b`.
fct_reorder2(df$color, df$a, df$b)
## [1] blue   green  purple red    yellow
## Levels: purple red blue green yellow


# Keep the chicks whose factor code is below 10, then shuffle the level
# order of `Chick`.
chks <- subset(ChickWeight, as.integer(Chick) < 10)
chks <- transform(chks, Chick = fct_shuffle(Chick))
chks
##     weight Time Chick Diet
## 85      42    0     8    1
## 86      50    2     8    1
## 87      61    4     8    1
## 88      71    6     8    1
## 89      84    8     8    1
## 90      93   10     8    1
## 91     110   12     8    1
## 92     116   14     8    1
## 93     126   16     8    1
## 94     134   18     8    1
## 95     125   20     8    1
## 96      42    0     9    1
## 97      51    2     9    1
## 98      59    4     9    1
## 99      68    6     9    1
## 100     85    8     9    1
## 101     96   10     9    1
## 102     90   12     9    1
## 103     92   14     9    1
## 104     93   16     9    1
## 105    100   18     9    1
## 106    100   20     9    1
## 107     98   21     9    1
## 108     41    0    10    1
## 109     44    2    10    1
## 110     52    4    10    1
## 111     63    6    10    1
## 112     74    8    10    1
## 113     81   10    10    1
## 114     89   12    10    1
## 115     96   14    10    1
## 116    101   16    10    1
## 117    112   18    10    1
## 118    120   20    10    1
## 119    124   21    10    1
## 144     41    0    13    1
## 145     48    2    13    1
## 146     53    4    13    1
## 147     60    6    13    1
## 148     65    8    13    1
## 149     67   10    13    1
## 150     71   12    13    1
## 151     70   14    13    1
## 152     71   16    13    1
## 153     81   18    13    1
## 154     91   20    13    1
## 155     96   21    13    1
## 168     41    0    15    1
## 169     49    2    15    1
## 170     56    4    15    1
## 171     64    6    15    1
## 172     68    8    15    1
## 173     68   10    15    1
## 174     67   12    15    1
## 175     68   14    15    1
## 176     41    0    16    1
## 177     45    2    16    1
## 178     49    4    16    1
## 179     51    6    16    1
## 180     57    8    16    1
## 181     51   10    16    1
## 182     54   12    16    1
## 183     42    0    17    1
## 184     51    2    17    1
## 185     61    4    17    1
## 186     72    6    17    1
## 187     83    8    17    1
## 188     89   10    17    1
## 189     98   12    17    1
## 190    103   14    17    1
## 191    113   16    17    1
## 192    123   18    17    1
## 193    133   20    17    1
## 194    142   21    17    1
## 195     39    0    18    1
## 196     35    2    18    1
## 209     41    0    20    1
## 210     47    2    20    1
## 211     54    4    20    1
## 212     58    6    20    1
## 213     65    8    20    1
## 214     73   10    20    1
## 215     77   12    20    1
## 216     89   14    20    1
## 217     98   16    20    1
## 218    107   18    20    1
## 219    115   20    20    1
## 220    117   21    20    1

# Growth curves coloured by chick, using the shuffled level order.
# The final layer was missing, leaving a dangling `+` (a syntax error).
ggplot(chks, aes(Time, weight, colour = Chick)) +
  geom_point() +
  geom_line()

plot of chunk unnamed-chunk-22

# The legend order now matches the order of the lines.
ggplot(chks, aes(Time, weight, colour = fct_reorder2(Chick, Time, weight))) +
  geom_point() +
  geom_line() +
  labs(colour = "Chick")

plot of chunk unnamed-chunk-22

1.4 fct_shuffle()


f <- factor(c("a", "b", "c"))
f
## [1] a b c
## Levels: a b c
fct_shuffle(f) # a different order on every run unless a seed is set
## [1] a b c
## Levels: b a c

1.5 fct_rev()


f <- factor(c("a", "b", "c"))
f
## [1] a b c
## Levels: a b c
# Reverse the level order; the values themselves are unchanged.
fct_rev(f)
## [1] a b c
## Levels: c b a

1.6 fct_shift()


# Ordered factor of weekdays; only three of the seven levels are used.
x <- factor(
  c("Mon", "Tue", "Wed"),
  levels = c("Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"),
  ordered = TRUE
)
x
## [1] Mon Tue Wed
## Levels: Sun < Mon < Tue < Wed < Thu < Fri < Sat

# Shift the levels one position to the left; the first level wraps to the end.
fct_shift(x)
## [1] Mon Tue Wed
## Levels: Mon < Tue < Wed < Thu < Fri < Sat < Sun

# Shift two positions to the left.
fct_shift(x, 2)
## [1] Mon Tue Wed
## Levels: Tue < Wed < Thu < Fri < Sat < Sun < Mon

# A negative value shifts to the right instead.
fct_shift(x, -1)
## [1] Mon Tue Wed
## Levels: Sat < Sun < Mon < Tue < Wed < Thu < Fri

2.1 fct_anon()


# Count the observations in each level of `relig`, with the original labels.
gss_cat$relig %>% fct_count()
## # A tibble: 16 x 2
##    f                           n
##    <fct>                   <int>
##  1 No answer                  93
##  2 Don't know                 15
##  3 Inter-nondenominational   109
##  4 Native american            23
##  5 Christian                 689
##  6 Orthodox-christian         95
##  7 Moslem/islam              104
##  8 Other eastern              32
##  9 Hinduism                   71
## 10 Buddhism                  147
## 11 Other                     224
## 12 None                     3523
## 13 Jewish                    388
## 14 Catholic                 5124
## 15 Protestant              10846
## 16 Not applicable              0
# Anonymise the levels first: the labels become numeric IDs and the counts
# no longer appear in the original level order.
gss_cat$relig %>% fct_anon() %>% fct_count()
## # A tibble: 16 x 2
##    f         n
##    <fct> <int>
##  1 01       32
##  2 02      224
##  3 03       93
##  4 04     3523
##  5 05      689
##  6 06     5124
##  7 07    10846
##  8 08      104
##  9 09      109
## 10 10      147
## 11 11       23
## 12 12       71
## 13 13      388
## 14 14        0
## 15 15       15
## 16 16       95
# Same, with "X" prepended to every anonymised label.
gss_cat$relig %>% fct_anon("X") %>% fct_count()
## # A tibble: 16 x 2
##    f         n
##    <fct> <int>
##  1 X01     109
##  2 X02    5124
##  3 X03     224
##  4 X04    3523
##  5 X05      95
##  6 X06       0
##  7 X07     689
##  8 X08      93
##  9 X09      32
## 10 X10     147
## 11 X11      15
## 12 X12      71
## 13 X13     388
## 14 X14     104
## 15 X15      23
## 16 X16   10846

2.2 fct_collapse()


# Count the observations in each level of `partyid` before collapsing.
gss_cat$partyid %>% fct_count()
## # A tibble: 10 x 2
##    f                      n
##    <fct>              <int>
##  1 No answer            154
##  2 Don't know             1
##  3 Other party          393
##  4 Strong republican   2314
##  5 Not str republican  3032
##  6 Ind,near rep        1791
##  7 Independent         4119
##  8 Ind,near dem        2499
##  9 Not str democrat    3690
## 10 Strong democrat     3490


# Collapse the ten party levels into five manually defined groups
# (new name = vector of old levels), then count the result.
partyid2 <- fct_collapse(gss_cat$partyid,
                         missing = c("No answer", "Don't know"),
                         rep = c("Strong republican", "Not str republican"),
                         other = "Other party",
                         ind = c("Ind,near rep", "Independent", "Ind,near dem"),
                         dem = c("Not str democrat", "Strong democrat")
)
fct_count(partyid2)
## # A tibble: 5 x 2
##   f           n
##   <fct>   <int>
## 1 missing   155
## 2 other     393
## 3 rep      5346
## 4 ind      8409
## 5 dem      7180

2.3 fct_lump()


  • fct_lump_min(): 把小于某些次数的归为其他类.

  • fct_lump_prop(): 把小于某个比例的归为其他类.

  • fct_lump_n(): 把个数最多的n个留下,其他的归为一类(如果n < 0,则个数最少的n个留下).

  • fct_lump_lowfreq(): 将最不频繁的级别合并在一起.

# Nine levels with very unequal frequencies.
x <- factor(rep(LETTERS[1:9], times = c(40, 10, 5, 27, 1, 1, 1, 1, 1)))
x %>% table()
## .
##  A  B  C  D  E  F  G  H  I 
## 40 10  5 27  1  1  1  1  1


# Keep the 3 most frequent levels and lump the rest into "Other".
x %>% fct_lump_n(3) %>% table() # ties.method = c("min", "average", "first", "last", "random", "max")
## .
##     A     B     D Other 
##    40    10    27    10


# A negative n keeps the least frequent levels instead.
x %>% fct_lump_n(-3) %>% table()
## .
##     E     F     G     H     I Other 
##     1     1     1     1     1    82


# Lump the levels whose share of the values is below the given proportion.
x %>% fct_lump_prop(0.1) %>% table()
## .
##     A     B     D Other 
##    40    10    27    10


# Lump the levels that occur fewer than 2 times, with a custom name for
# the lumped level.
x %>% fct_lump_min(2, other_level = "其他") %>% table()
## .
##    A    B    C    D 其他 
##   40   10    5   27    5


# Lump the least frequent levels together.
x %>% fct_lump_lowfreq() %>% table()
## .
##     A     D Other 
##    40    27    20

2.4 fct_other()

把某些因子归为其他类,类似于 fct_lump

# Nine levels with very unequal frequencies.
x <- factor(rep(LETTERS[1:9], times = c(40, 10, 5, 27, 1, 1, 1, 1, 1)))

# Keep A and B; everything else is lumped into one level.
fct_other(x, keep = c("A", "B"), other_level = "other")
##  [1] A     A     A     A     A     A     A     A     A     A     A     A    
## [13] A     A     A     A     A     A     A     A     A     A     A     A    
## [25] A     A     A     A     A     A     A     A     A     A     A     A    
## [37] A     A     A     A     B     B     B     B     B     B     B     B    
## [49] B     B     other other other other other other other other other other
## [61] other other other other other other other other other other other other
## [73] other other other other other other other other other other other other
## [85] other other other
## Levels: A B other

# Lump A and B into one level; everything else is kept.
fct_other(x, drop = c("A", "B"), other_level = "hhahah")
##  [1] hhahah hhahah hhahah hhahah hhahah hhahah hhahah hhahah hhahah hhahah
## [11] hhahah hhahah hhahah hhahah hhahah hhahah hhahah hhahah hhahah hhahah
## [21] hhahah hhahah hhahah hhahah hhahah hhahah hhahah hhahah hhahah hhahah
## [31] hhahah hhahah hhahah hhahah hhahah hhahah hhahah hhahah hhahah hhahah
## [41] hhahah hhahah hhahah hhahah hhahah hhahah hhahah hhahah hhahah hhahah
## [51] C      C      C      C      C      D      D      D      D      D     
## [61] D      D      D      D      D      D      D      D      D      D     
## [71] D      D      D      D      D      D      D      D      D      D     
## [81] D      D      E      F      G      H      I     
## Levels: C D E F G H I hhahah

2.5 fct_recode()


x <- factor(c("apple", "bear", "banana", "dear"))
x
## [1] apple  bear   banana dear  
## Levels: apple banana bear dear

# Recode two old levels to the same new name, which merges them.
fct_recode(x, fruit = "apple", fruit = "banana")
## [1] fruit bear  fruit dear 
## Levels: fruit bear dear
# Recoding to NULL removes the level; its values become NA.
fct_recode(x, NULL = "apple", fruit = "banana")
## [1] <NA>  bear  fruit dear 
## Levels: fruit bear dear
# New names containing spaces must be quoted.
fct_recode(x, "an apple" = "apple", "a bear" = "bear")
## [1] an apple a bear   banana   dear    
## Levels: an apple banana a bear dear
# The mapping can also be stored in a named vector and spliced in with `!!!`.
x <- factor(c("apple", "bear", "banana", "dear"))
levels <- c(fruit = "apple", fruit = "banana")
fct_recode(x, !!!levels)
## [1] fruit bear  fruit dear 
## Levels: fruit bear dear

2.6 fct_relabel()

# Level labels and counts of `partyid` before relabelling.
gss_cat$partyid %>% fct_count()
## # A tibble: 10 x 2
##    f                      n
##    <fct>              <int>
##  1 No answer            154
##  2 Don't know             1
##  3 Other party          393
##  4 Strong republican   2314
##  5 Not str republican  3032
##  6 Ind,near rep        1791
##  7 Independent         4119
##  8 Ind,near dem        2499
##  9 Not str democrat    3690
## 10 Strong democrat     3490
# Relabel every level with a function: here, put a space after each comma.
gss_cat$partyid %>% fct_relabel(~ gsub(",", ", ", .x)) %>% fct_count()
## # A tibble: 10 x 2
##    f                      n
##    <fct>              <int>
##  1 No answer            154
##  2 Don't know             1
##  3 Other party          393
##  4 Strong republican   2314
##  5 Not str republican  3032
##  6 Ind, near rep       1791
##  7 Independent         4119
##  8 Ind, near dem       2499
##  9 Not str democrat    3690
## 10 Strong democrat     3490

3.1 fct_expand()


# Random factor of 20 values drawn from "a", "b", "c".
f <- factor(sample(letters[1:3], 20, replace = TRUE))
f
##  [1] c b b a a a b a b b b a a c a c a a a b
## Levels: a b c

# Add extra levels; the values are unchanged.
fct_expand(f, "d", "e", "f")
##  [1] c b b a a a b a b b b a a c a c a a a b
## Levels: a b c d e f

3.2 fct_drop()


# "c" is declared as a level but never occurs in the data.
f <- factor(c("a", "b"), levels = c("a", "b", "c"))
f
## [1] a b
## Levels: a b c

# Drop the unused level "c".
fct_drop(f, "c")
## [1] a b
## Levels: a b

3.3 fct_explicit_na()

给 NA 一个明确的水平,确保画图或汇总的时候能用上

# Factor containing missing values.
f1 <- factor(c("a", "a", NA, NA, "a", "b", NA, "c", "a", "c", "b"))
fct_count(f1)
## # A tibble: 4 x 2
##   f         n
##   <fct> <int>
## 1 a         4
## 2 b         2
## 3 c         2
## 4 <NA>      3
# Turn NA into a real level named "missing".
# NOTE(review): newer forcats releases appear to deprecate fct_explicit_na()
# in favour of fct_na_value_to_level() - confirm against the installed version.
f2 <- fct_explicit_na(f1, na_level = "missing")
fct_count(f2)
## # A tibble: 4 x 2
##   f           n
##   <fct>   <int>
## 1 a           4
## 2 b           2
## 3 c           2
## 4 missing     3

3.4 fct_unify()


# Three factors with different level sets. The recorded output has three
# elements, so the list needs factor("b") as well.
fs <- list(
  factor("a"),
  factor("b"),
  factor(c("a", "b"))
)

# Give every factor in the list the same set of levels.
fct_unify(fs, levels = c("a", "b", "c"))
## [[1]]
## [1] a
## Levels: a b c
## [[2]]
## [1] b
## Levels: a b c
## [[3]]
## [1] a b
## Levels: a b c

4.1 fct_c()


# Three factors whose level sets differ.
fa <- factor("a")
fb <- factor("b")
fab <- factor(c("a", "b"))

# Concatenate with base c().
c(fa, fb, fab)
## [1] a b a b
## Levels: a b

# Concatenate with forcats.
fct_c(fa, fb, fab)
## [1] a b a b
## Levels: a b

4.2 fct_cross()


# Two factors and one character vector of equal length.
fruit <- factor(c("apple", "kiwi", "apple", "apple"))
colour <- factor(c("green", "green", "red", "green"))
eaten <- c("yes", "no", "yes", "no")

# Combine the inputs element-wise into one factor with ":"-joined labels.
fct_cross(fruit, colour)
## [1] apple:green kiwi:green  apple:red   apple:green
## Levels: apple:green kiwi:green apple:red

# Works the same way with more than two inputs.
fct_cross(fruit, colour, eaten)
## [1] apple:green:yes kiwi:green:no   apple:red:yes   apple:green:no 
## Levels: apple:green:no kiwi:green:no apple:green:yes apple:red:yes

5.1 as_factor()

变成因子向量,和 as.factor() 作用一样,但略有不同

x <- c("a", "z", "g")
as.factor(x) # levels get sorted, so their order changes
## [1] a z g
## Levels: a g z
as_factor(x) # levels keep the original order of the values
## [1] a z g
## Levels: a z g

5.2 fct_count()


# Random factor of 1000 letters.
f <- factor(sample(letters)[rpois(1000, 10)])
table(f)
## f
##   a   b   d   e   g   h   i   j   k   l   m   n   o   q   r   t   u   v   x   y 
##  13   2  17   1  13  47  10   1 106  28 132  21  97  51  99  43   3 128   1  63 
##   z 
## 124
# Count per level, sorted by count, with proportions added.
# TRUE is spelled out because `T` can be reassigned.
fct_count(f, sort = TRUE, prop = TRUE)
## # A tibble: 21 x 3
##    f         n     p
##    <fct> <int> <dbl>
##  1 m       132 0.132
##  2 v       128 0.128
##  3 z       124 0.124
##  4 k       106 0.106
##  5 r        99 0.099
##  6 o        97 0.097
##  7 y        63 0.063
##  8 q        51 0.051
##  9 h        47 0.047
## 10 t        43 0.043
## # ... with 11 more rows

5.3 fct_match()


# Tabulate how many values of `marital` are / are not one of the two levels.
table(fct_match(gss_cat$marital, c("Married", "Divorced")))
## 
## FALSE  TRUE 
##  7983 13500

5.4 fct_unique()


f <- factor(letters[rpois(100, 10)])
unique(f) # unique values in order of appearance
##  [1] i o j n p l k h q e f a m d g b
## Levels: a b d e f g h i j k l m n o p q

fct_unique(f) # unique values in level order
##  [1] a b d e f g h i j k l m n o p q
## Levels: a b d e f g h i j k l m n o p q

5.5 lvls_reorder()

f <- factor(c("a", "b", "c"))

# Reorder the levels by position; the values are unchanged.
lvls_reorder(f, 3:1)
## [1] a b c
## Levels: c b a

# Replace the level labels, which renames the values too.
lvls_revalue(f, c("apple", "banana", "carrot"))
## [1] apple  banana carrot
## Levels: apple banana carrot

# Extend the set of levels.
lvls_expand(f, c("a", "b", "c", "d"))
## [1] a b c
## Levels: a b c d

5.6 lvls_union()


# Collect the levels that occur in any factor of the list.
fs <- list(factor("a"), factor("b"), factor(c("a", "b")))
lvls_union(fs)
## [1] "a" "b"







