【R】tidyr & purrr - 数据重塑与变量循环，帮助你提高数据清洗效率-CSDN博客

前言

tidyr - 用于数据集的长宽表转换、分列等形状重塑（reshape）

purrr - 实现了对变量数据的快速循环，让我们可以在对数据集内列变量进行循环时摒弃缓慢的for循环，提高数据处理效率与代码简洁性

数据准备

> pacman::p_load(data.table, tidyverse) #也可以直接library
> data <- fread("D://contest//transactions.csv")


> #将数据框转换为tibble形式数据框，为dplyr函数加快处理速度
> data <- tbl_df(data)


> #查看行列信息
> names(data)
 [1] "authorized_flag"      "card_id"              "city_id"              "category_1"           "installments"         "category_3"          
 [7] "merchant_category_id" "merchant_id"          "month_lag"            "purchase_amount"      "purchase_date"        "category_2"          
[13] "state_id"             "subsector_id"        


> dim(data)
[1] 185129     14
#185129行，14列


> data
# A tibble: 185,129 x 14
   authorized_flag card_id     city_id category_1 installments category_3 merchant_category~ merchant_id   month_lag purchase_amount purchase_date    category_2 state_id subsector_id
   <chr>           <chr>         <int> <chr>             <int> <chr>                   <int> <chr>             <int>           <dbl> <chr>                 <int>    <int>        <int>
 1 Y               C_ID_cb34e~      20 N                     0 A                         422 M_ID_f162748~         1          -0.660 2018-03-07 10:5~          3       19           27
 2 Y               C_ID_8a118~     251 N                     0 A                         278 M_ID_00c57ea~         2          -0.624 2017-07-21 12:1~          3        8           37
 3 Y               C_ID_69ed7~     117 N                     0 A                         367 M_ID_1da88fb~         2          -0.714 2017-10-27 20:2~          4       13           16
 4 Y               C_ID_ae6f8~      69 N                     1 B                         383 M_ID_ebbdb42~         2          -0.446 2018-01-26 08:4~          1        9            2
 5 Y               C_ID_07b21~     277 N                     0 A                         278 M_ID_7291e8a~         1          -0.736 2018-03-10 21:4~          4       13           37
 6 Y               C_ID_34521~     291 N                     0 A                         422 M_ID_0649e6e~         2          -0.702 2018-03-11 19:3~          1        9           27
 7 Y               C_ID_5d09f~      69 N                     0 A                         511 M_ID_b794b9d~         2          -0.183 2018-04-21 13:1~          1        9            7
 8 Y               C_ID_a12f5~     331 N                     0 A                         273 M_ID_73d19e5~         1          -0.720 2018-03-21 06:5~          3        3           20
 9 Y               C_ID_88fb2~      69 N                     0 A                          19 M_ID_a79f97e~         1          -0.726 2018-03-01 16:5~          1        9           36
10 Y               C_ID_8ef5e~     261 N                     1 B                         823 M_ID_2059928~         2          -0.509 2018-03-04 03:5~          1        9           25
# ... with 185,119 more rows


#筛选出card_id, category_2, category_3三个列
#并取前10行赋值到category
> data %>% 
    select(card_id, category_2, category_3) %>% 
    slice(1:10)-> category


#在category中插入一个缺失值
>  category[3, 3] <- NA


> category
# A tibble: 10 x 3
   card_id         category_2 category_3
   <chr>                <int> <chr>     
 1 C_ID_cb34e13a1e          3 A         
 2 C_ID_8a1184a21e          3 A         
 3 C_ID_69ed7c9177          4 NA        
 4 C_ID_ae6f8dd403          1 B         
 5 C_ID_07b2107875          4 A         
 6 C_ID_34521fce1b          1 A         
 7 C_ID_5d09feb8b0          1 A         
 8 C_ID_a12f57ce14          3 A         
 9 C_ID_88fb23a843          1 A         
10 C_ID_8ef5e8c067          1 B

我们以一个kaggle上的信用卡消费数据集为例

百度网盘下载：

链接：https://pan.baidu.com/s/1Qv3nAJxfo7hxjdTOGoLOXA

提取码：f4ks

tidyr

转长表 - gather

#在对数据集category进行转长表操作时，要指定以哪些列为“key”

#这里指定category_3, category_2两列为key，重塑后数据集形状如下

> category %>% 
    gather(category_3, category_2, key = "category", value = "cases") -> category_long


> category_long
# A tibble: 20 x 3
   card_id         category   cases
   <chr>           <chr>      <chr>
 1 C_ID_cb34e13a1e category_3 A    
 2 C_ID_8a1184a21e category_3 A    
 3 C_ID_69ed7c9177 category_3 NA   
 4 C_ID_ae6f8dd403 category_3 B    
 5 C_ID_07b2107875 category_3 A    
 6 C_ID_34521fce1b category_3 A    
 7 C_ID_5d09feb8b0 category_3 A    
 8 C_ID_a12f57ce14 category_3 A    
 9 C_ID_88fb23a843 category_3 A    
10 C_ID_8ef5e8c067 category_3 B    
11 C_ID_cb34e13a1e category_2 3    
12 C_ID_8a1184a21e category_2 3    
13 C_ID_69ed7c9177 category_2 4    
14 C_ID_ae6f8dd403 category_2 1    
15 C_ID_07b2107875 category_2 4    
16 C_ID_34521fce1b category_2 1    
17 C_ID_5d09feb8b0 category_2 1    
18 C_ID_a12f57ce14 category_2 3    
19 C_ID_88fb23a843 category_2 1    
20 C_ID_8ef5e8c067 category_2 1

#我们可以看到列category内罗列出了被指定的“key”

#cases列内的值为重塑前category_3, category_2列包含的值

#同时，card_id也自动匹配并延展为对应长度

转宽表 - spread

#对应的，指定category为“key”，cases为“value”进行转宽表

> category_long %>% 
    spread(category, cases)
    
# A tibble: 10 x 3
   card_id         category_2 category_3
   <chr>           <chr>      <chr>     
 1 C_ID_07b2107875 4          A         
 2 C_ID_34521fce1b 1          A         
 3 C_ID_5d09feb8b0 1          A         
 4 C_ID_69ed7c9177 4          NA        
 5 C_ID_88fb23a843 1          A         
 6 C_ID_8a1184a21e 3          A         
 7 C_ID_8ef5e8c067 1          B         
 8 C_ID_a12f57ce14 3          A         
 9 C_ID_ae6f8dd403 1          B         
10 C_ID_cb34e13a1e 3          A

分列 - separate

#将card_id列以"_"为区分进行分列，将分列后产生的3个列分别命名为："noneed", "noneed2", "id"，最后赋值到变量category_sep

> category %>% 
    separate(card_id, sep = "_", into = c("noneed", "noneed2", "id")) -> category_sep
    
> category_sep
# A tibble: 10 x 5
   noneed noneed2 id         category_2 category_3
   <chr>  <chr>   <chr>           <int> <chr>     
 1 C      ID      cb34e13a1e          3 A         
 2 C      ID      8a1184a21e          3 A         
 3 C      ID      69ed7c9177          4 NA        
 4 C      ID      ae6f8dd403          1 B         
 5 C      ID      07b2107875          4 A         
 6 C      ID      34521fce1b          1 A         
 7 C      ID      5d09feb8b0          1 A         
 8 C      ID      a12f57ce14          3 A         
 9 C      ID      88fb23a843          1 A         
10 C      ID      8ef5e8c067          1 B

变量合并 - unite

#将noneed, noneed2, id合并为一列，命名为"ID"，连接符号为":"

> category_sep %>% 
    unite(noneed, noneed2, id, col = "ID", sep = ":")
  
# A tibble: 10 x 3
   ID              category_2 category_3
   <chr>                <int> <chr>     
 1 C:ID:cb34e13a1e          3 A         
 2 C:ID:8a1184a21e          3 A         
 3 C:ID:69ed7c9177          4 NA        
 4 C:ID:ae6f8dd403          1 B         
 5 C:ID:07b2107875          4 A         
 6 C:ID:34521fce1b          1 A         
 7 C:ID:5d09feb8b0          1 A         
 8 C:ID:a12f57ce14          3 A         
 9 C:ID:88fb23a843          1 A         
10 C:ID:8ef5e8c067          1 B

分列同时转长表 - separate_rows

#这里直接对card_id列进行了根据"_"字符的分割后，直接以原列为“key”进行了长表转换，省去了单纯长表转换separate函数的新列命名过程。

> category %>% 
    separate_rows(card_id, sep = "_") #分列&转长表
    
# A tibble: 30 x 3
   card_id    category_2 category_3
   <chr>           <int> <chr>     
 1 C                   3 A         
 2 ID                  3 A         
 3 cb34e13a1e          3 A         
 4 C                   3 A         
 5 ID                  3 A         
 6 8a1184a21e          3 A         
 7 C                   4 NA        
 8 ID                  4 NA        
 9 69ed7c9177          4 NA        
10 C                   1 B         
# ... with 20 more rows

缺失值处理

这里引入tidyr包中的三个函数（刚才插入的NA值是为了这里的)：

#删除带有NA的行

> category %>% 
    drop_na()  
    
# A tibble: 9 x 3
  card_id         category_2 category_3
  <chr>                <int> <chr>     
1 C_ID_cb34e13a1e          3 A         
2 C_ID_8a1184a21e          3 A         
3 C_ID_ae6f8dd403          1 B         
4 C_ID_07b2107875          4 A         
5 C_ID_34521fce1b          1 A         
6 C_ID_5d09feb8b0          1 A         
7 C_ID_a12f57ce14          3 A         
8 C_ID_88fb23a843          1 A         
9 C_ID_8ef5e8c067          1 B

#使用相邻的值替换NA

> category %>% 
    fill(category_3) 
    
# A tibble: 10 x 3
   card_id         category_2 category_3
   <chr>                <int> <chr>     
 1 C_ID_cb34e13a1e          3 A         
 2 C_ID_8a1184a21e          3 A         
 3 C_ID_69ed7c9177          4 A         
 4 C_ID_ae6f8dd403          1 B         
 5 C_ID_07b2107875          4 A         
 6 C_ID_34521fce1b          1 A         
 7 C_ID_5d09feb8b0          1 A         
 8 C_ID_a12f57ce14          3 A         
 9 C_ID_88fb23a843          1 A         
10 C_ID_8ef5e8c067          1 B

#指定值替换NA

> category %>% 
    replace_na(list(category_3 = "X")) 
    
# A tibble: 10 x 3
   card_id         category_2 category_3
   <chr>                <int> <chr>     
 1 C_ID_cb34e13a1e          3 A         
 2 C_ID_8a1184a21e          3 A         
 3 C_ID_69ed7c9177          4 X         
 4 C_ID_ae6f8dd403          1 B         
 5 C_ID_07b2107875          4 A         
 6 C_ID_34521fce1b          1 A         
 7 C_ID_5d09feb8b0          1 A         
 8 C_ID_a12f57ce14          3 A         
 9 C_ID_88fb23a843          1 A         
10 C_ID_8ef5e8c067          1 B

purrr

#数据准备 - 我们只取data中的数值变量

>data %>% 
   select_if(is.numeric) -> data
  
> data
# A tibble: 185,129 x 8
   city_id installments merchant_category_id month_lag purchase_amount category_2 state_id subsector_id
     <int>        <int>                <int>     <int>           <dbl>      <int>    <int>        <int>
 1      20            0                  422         1          -0.660          3       19           27
 2     251            0                  278         2          -0.624          3        8           37
 3     117            0                  367         2          -0.714          4       13           16
 4      69            1                  383         2          -0.446          1        9            2
 5     277            0                  278         1          -0.736          4       13           37
 6     291            0                  422         2          -0.702          1        9           27
 7      69            0                  511         2          -0.183          1        9            7
 8     331            0                  273         1          -0.720          3        3           20
 9      69            0                   19         1          -0.726          1        9           36
10     261            1                  823         2          -0.509          1        9           25
# ... with 185,119 more rows

对数据集的所有列循环 - map_df

#求所有列的均值

> data %>% 
    map_df(mean)
    
# A tibble: 1 x 8
  city_id installments merchant_category_id month_lag purchase_amount category_2 state_id subsector_id
    <dbl>        <dbl>                <dbl>     <dbl>           <dbl>      <dbl>    <dbl>        <dbl>
1    142.        0.627                 426.      1.48          -0.562       2.20     11.6         26.2

‍

#全变量max-min标准化

> data %>% 
    map_df(function(x) ((x - min(x))/(max(x)-min(x))))


# A tibble: 185,129 x 8
   city_id installments merchant_category_id month_lag purchase_amount category_2 state_id subsector_id
     <dbl>        <dbl>                <dbl>     <dbl>           <dbl>      <dbl>    <dbl>        <dbl>
 1  0.0549       0.0769               0.472          0        0.00238        0.5    0.783         0.65 
 2  0.723        0.0769               0.310          1        0.00336        0.5    0.304         0.9  
 3  0.335        0.0769               0.411          1        0.000893       0.75   0.522         0.375
 4  0.197        0.154                0.429          1        0.00820        0      0.348         0.025
 5  0.798        0.0769               0.310          0        0.000293       0.75   0.522         0.9  
 6  0.838        0.0769               0.472          1        0.00123        0      0.348         0.65 
 7  0.197        0.0769               0.573          1        0.0154         0      0.348         0.15 
 8  0.954        0.0769               0.305          0        0.000737       0.5    0.0870        0.475
 9  0.197        0.0769               0.0191         0        0.000582       0      0.348         0.875
10  0.751        0.154                0.924          1        0.00649        0      0.348         0.6  
# ... with 185,119 more rows

#全变量Z-score归一化

> data %>% 
    map_df(scale)
    
# A tibble: 185,129 x 8
   city_id installments merchant_category_id month_lag purchase_amount category_2 state_id subsector_id
     <dbl>        <dbl>                <dbl>     <dbl>           <dbl>      <dbl>    <dbl>        <dbl>
 1  -1.23        -0.571              -0.0156    -0.952         -0.166       0.521    1.36        0.0798
 2   1.10        -0.571              -0.604      1.05          -0.105       0.521   -0.662       1.08  
 3  -0.251       -0.571              -0.240      1.05          -0.258       1.17     0.255      -1.02  
 4  -0.733        0.339              -0.175      1.05           0.195      -0.786   -0.479      -2.41  
 5   1.36        -0.571              -0.604     -0.952         -0.296       1.17     0.255       1.08  
 6   1.50        -0.571              -0.0156     1.05          -0.238      -0.786   -0.479       0.0798
 7  -0.733       -0.571               0.348      1.05           0.641      -0.786   -0.479      -1.92  
 8   1.90        -0.571              -0.624     -0.952         -0.268       0.521   -1.58       -0.619 
 9  -0.733       -0.571              -1.66      -0.952         -0.278      -0.786   -0.479       0.978 
10   1.20         0.339               1.62       1.05           0.0894     -0.786   -0.479      -0.120 
# ... with 185,119 more rows

#全变量log处理

> data %>% 
    map_df(function(x) log(x + 1)) 
    
# A tibble: 185,129 x 8
   city_id installments merchant_category_id month_lag purchase_amount category_2 state_id subsector_id
     <dbl>        <dbl>                <dbl>     <dbl>           <dbl>      <dbl>    <dbl>        <dbl>
 1    3.04        0                     6.05     0.693          -1.08       1.39      3.00         3.33
 2    5.53        0                     5.63     1.10           -0.977      1.39      2.20         3.64
 3    4.77        0                     5.91     1.10           -1.25       1.61      2.64         2.83
 4    4.25        0.693                 5.95     1.10           -0.591      0.693     2.30         1.10
 5    5.63        0                     5.63     0.693          -1.33       1.61      2.64         3.64
 6    5.68        0                     6.05     1.10           -1.21       0.693     2.30         3.33
 7    4.25        0                     6.24     1.10           -0.203      0.693     2.30         2.08
 8    5.81        0                     5.61     0.693          -1.27       1.39      1.39         3.04
 9    4.25        0                     3.00     0.693          -1.29       0.693     2.30         3.61
10    5.57        0.693                 6.71     1.10           -0.711      0.693     2.30         3.26
# ... with 185,119 more rows

指定数据集中的列做循环 - modify_at

#选择对installments列做log处理

> data %>% 
    modify_at(c("installments"), function(x) log(x + 1)) 


# A tibble: 185,129 x 8
   city_id installments merchant_category_id month_lag purchase_amount category_2 state_id subsector_id
     <int>        <dbl>                <int>     <int>           <dbl>      <int>    <int>        <int>
 1      20        0                      422         1          -0.660          3       19           27
 2     251        0                      278         2          -0.624          3        8           37
 3     117        0                      367         2          -0.714          4       13           16
 4      69        0.693                  383         2          -0.446          1        9            2
 5     277        0                      278         1          -0.736          4       13           37
 6     291        0                      422         2          -0.702          1        9           27
 7      69        0                      511         2          -0.183          1        9            7
 8     331        0                      273         1          -0.720          3        3           20
 9      69        0                       19         1          -0.726          1        9           36
10     261        0.693                  823         2          -0.509          1        9           25
# ... with 185,119 more rows

#选择对installments列做max-min标准化

> data %>% 
    modify_at(c("installments"), function(x) ((x - min(x))/(max(x) - min(x))))


# A tibble: 185,129 x 8
   city_id installments merchant_category_id month_lag purchase_amount category_2 state_id subsector_id
     <int>        <dbl>                <int>     <int>           <dbl>      <int>    <int>        <int>
 1      20       0.0769                  422         1          -0.660          3       19           27
 2     251       0.0769                  278         2          -0.624          3        8           37
 3     117       0.0769                  367         2          -0.714          4       13           16
 4      69       0.154                   383         2          -0.446          1        9            2
 5     277       0.0769                  278         1          -0.736          4       13           37
 6     291       0.0769                  422         2          -0.702          1        9           27
 7      69       0.0769                  511         2          -0.183          1        9            7
 8     331       0.0769                  273         1          -0.720          3        3           20
 9      69       0.0769                   19         1          -0.726          1        9           36
10     261       0.154                   823         2          -0.509          1        9           25
# ... with 185,119 more rows

#选择对installments做列Z-score归一化

> data %>% 
    modify_at(c("installments"), scale)


# A tibble: 185,129 x 8
   city_id installments[,1] merchant_category_id month_lag purchase_amount category_2 state_id subsector_id
     <int>            <dbl>                <int>     <int>           <dbl>      <int>    <int>        <int>
 1      20           -0.571                  422         1          -0.660          3       19           27
 2     251           -0.571                  278         2          -0.624          3        8           37
 3     117           -0.571                  367         2          -0.714          4       13           16
 4      69            0.339                  383         2          -0.446          1        9            2
 5     277           -0.571                  278         1          -0.736          4       13           37
 6     291           -0.571                  422         2          -0.702          1        9           27
 7      69           -0.571                  511         2          -0.183          1        9            7
 8     331           -0.571                  273         1          -0.720          3        3           20
 9      69           -0.571                   19         1          -0.726          1        9           36
10     261            0.339                  823         2          -0.509          1        9           25
# ... with 185,119 more rows

指定条件保留or剔除某列

#保留列均值大于10的列

> data %>% 
    keep(function(x) mean(x) > 10) 


# A tibble: 185,129 x 4
   city_id merchant_category_id state_id subsector_id
     <int>                <int>    <int>        <int>
 1      20                  422       19           27
 2     251                  278        8           37
 3     117                  367       13           16
 4      69                  383        9            2
 5     277                  278       13           37
 6     291                  422        9           27
 7      69                  511        9            7
 8     331                  273        3           20
 9      69                   19        9           36
10     261                  823        9           25
# ... with 185,119 more rows

#剔除列均值大于10的列

> data %>% 
    discard(function(x) mean(x) > 10) #剔除列均值大于10的列


# A tibble: 185,129 x 4
   installments month_lag purchase_amount category_2
          <int>     <int>           <dbl>      <int>
 1            0         1          -0.660          3
 2            0         2          -0.624          3
 3            0         2          -0.714          4
 4            1         2          -0.446          1
 5            0         1          -0.736          4
 6            0         2          -0.702          1
 7            0         2          -0.183          1
 8            0         1          -0.720          3
 9            0         1          -0.726          1
10            1         2          -0.509          1
# ... with 185,119 more rows

最后

相信写到这里大家已经基本理解了tidyr&purrr包的功能与用法，数据清洗作为特征工程中重要的一环，熟练掌握数据表的重塑与清洗方法才能为数据分析提供最基本的支撑。

注：purrr包本文只介绍了对于数据框为对象的函数功能，此R包其实着重于处理list类型的数据，有兴趣的同学可以学习一下。

·END·

R语言与数据分析

生产力干货

微信号：RforData