前言
tidyr - 用于数据集的长宽表转换、分列等形状重塑(reshape)
purrr - 实现了对变量数据的快速循环,让我们可以在对数据集内列变量进行循环时摒弃缓慢的for循环,提高数据处理效率与代码简洁性
数据准备
> pacman::p_load(data.table, tidyverse) #也可以直接library
> data <- fread("D://contest//transactions.csv")
> #将数据框转换为tibble形式数据框,为dplyr函数加快处理速度
> data <- tbl_df(data)
> #查看行列信息
> names(data)
[1] "authorized_flag" "card_id" "city_id" "category_1" "installments" "category_3"
[7] "merchant_category_id" "merchant_id" "month_lag" "purchase_amount" "purchase_date" "category_2"
[13] "state_id" "subsector_id"
> dim(data)
[1] 185129 14
#185129行,14列
> data
# A tibble: 185,129 x 14
authorized_flag card_id city_id category_1 installments category_3 merchant_category~ merchant_id month_lag purchase_amount purchase_date category_2 state_id subsector_id
<chr> <chr> <int> <chr> <int> <chr> <int> <chr> <int> <dbl> <chr> <int> <int> <int>
1 Y C_ID_cb34e~ 20 N 0 A 422 M_ID_f162748~ 1 -0.660 2018-03-07 10:5~ 3 19 27
2 Y C_ID_8a118~ 251 N 0 A 278 M_ID_00c57ea~ 2 -0.624 2017-07-21 12:1~ 3 8 37
3 Y C_ID_69ed7~ 117 N 0 A 367 M_ID_1da88fb~ 2 -0.714 2017-10-27 20:2~ 4 13 16
4 Y C_ID_ae6f8~ 69 N 1 B 383 M_ID_ebbdb42~ 2 -0.446 2018-01-26 08:4~ 1 9 2
5 Y C_ID_07b21~ 277 N 0 A 278 M_ID_7291e8a~ 1 -0.736 2018-03-10 21:4~ 4 13 37
6 Y C_ID_34521~ 291 N 0 A 422 M_ID_0649e6e~ 2 -0.702 2018-03-11 19:3~ 1 9 27
7 Y C_ID_5d09f~ 69 N 0 A 511 M_ID_b794b9d~ 2 -0.183 2018-04-21 13:1~ 1 9 7
8 Y C_ID_a12f5~ 331 N 0 A 273 M_ID_73d19e5~ 1 -0.720 2018-03-21 06:5~ 3 3 20
9 Y C_ID_88fb2~ 69 N 0 A 19 M_ID_a79f97e~ 1 -0.726 2018-03-01 16:5~ 1 9 36
10 Y C_ID_8ef5e~ 261 N 1 B 823 M_ID_2059928~ 2 -0.509 2018-03-04 03:5~ 1 9 25
# ... with 185,119 more rows
#筛选出card_id, category_2, category_3三个列
#并取前10行赋值到category
> data %>%
select(card_id, category_2, category_3) %>%
slice(1:10)-> category
#在category中插入一个缺失值
> category[3, 3] <- NA
> category
# A tibble: 10 x 3
card_id category_2 category_3
<chr> <int> <chr>
1 C_ID_cb34e13a1e 3 A
2 C_ID_8a1184a21e 3 A
3 C_ID_69ed7c9177 4 NA
4 C_ID_ae6f8dd403 1 B
5 C_ID_07b2107875 4 A
6 C_ID_34521fce1b 1 A
7 C_ID_5d09feb8b0 1 A
8 C_ID_a12f57ce14 3 A
9 C_ID_88fb23a843 1 A
10 C_ID_8ef5e8c067 1 B
我们以一个kaggle上的信用卡消费数据集为例
百度网盘下载:
链接:https://pan.baidu.com/s/1Qv3nAJxfo7hxjdTOGoLOXA
提取码:f4ks
tidyr
1
转长表 - gather
#在对数据集category进行转长表操作时,要指定以哪些列为“key”
#这里指定category_3, category_2两列为key,重塑后数据集形状如下
> category %>%
gather(category_3, category_2, key = "category", value = "cases") -> category_long
> category_long
# A tibble: 20 x 3
card_id category cases
<chr> <chr> <chr>
1 C_ID_cb34e13a1e category_3 A
2 C_ID_8a1184a21e category_3 A
3 C_ID_69ed7c9177 category_3 NA
4 C_ID_ae6f8dd403 category_3 B
5 C_ID_07b2107875 category_3 A
6 C_ID_34521fce1b category_3 A
7 C_ID_5d09feb8b0 category_3 A
8 C_ID_a12f57ce14 category_3 A
9 C_ID_88fb23a843 category_3 A
10 C_ID_8ef5e8c067 category_3 B
11 C_ID_cb34e13a1e category_2 3
12 C_ID_8a1184a21e category_2 3
13 C_ID_69ed7c9177 category_2 4
14 C_ID_ae6f8dd403 category_2 1
15 C_ID_07b2107875 category_2 4
16 C_ID_34521fce1b category_2 1
17 C_ID_5d09feb8b0 category_2 1
18 C_ID_a12f57ce14 category_2 3
19 C_ID_88fb23a843 category_2 1
20 C_ID_8ef5e8c067 category_2 1
#我们可以看到列category内罗列出了被指定的“key”
#cases列内的值为重塑前category_3, category_2列包含的值
#同时,card_id也自动匹配并延展为对应长度
2
转宽表 - spread
#对应的,指定category为“key”,cases为“value”进行转宽表
> category_long %>%
spread(category, cases)
# A tibble: 10 x 3
card_id category_2 category_3
<chr> <chr> <chr>
1 C_ID_07b2107875 4 A
2 C_ID_34521fce1b 1 A
3 C_ID_5d09feb8b0 1 A
4 C_ID_69ed7c9177 4 NA
5 C_ID_88fb23a843 1 A
6 C_ID_8a1184a21e 3 A
7 C_ID_8ef5e8c067 1 B
8 C_ID_a12f57ce14 3 A
9 C_ID_ae6f8dd403 1 B
10 C_ID_cb34e13a1e 3 A
3
分列 - separate
#将card_id列以"_"为区分进行分列,将分列后产生的3个列分别命名为:"noneed", "noneed2", "id",最后赋值到变量category_sep
> category %>%
separate(card_id, sep = "_", into = c("noneed", "noneed2", "id")) -> category_sep
> category_sep
# A tibble: 10 x 5
noneed noneed2 id category_2 category_3
<chr> <chr> <chr> <int> <chr>
1 C ID cb34e13a1e 3 A
2 C ID 8a1184a21e 3 A
3 C ID 69ed7c9177 4 NA
4 C ID ae6f8dd403 1 B
5 C ID 07b2107875 4 A
6 C ID 34521fce1b 1 A
7 C ID 5d09feb8b0 1 A
8 C ID a12f57ce14 3 A
9 C ID 88fb23a843 1 A
10 C ID 8ef5e8c067 1 B
4
变量合并 - unite
#将noneed, noneed2, id合并为一列,命名为"ID",连接符号为":"
> category_sep %>%
unite(noneed, noneed2, id, col = "ID", sep = ":")
# A tibble: 10 x 3
ID category_2 category_3
<chr> <int> <chr>
1 C:ID:cb34e13a1e 3 A
2 C:ID:8a1184a21e 3 A
3 C:ID:69ed7c9177 4 NA
4 C:ID:ae6f8dd403 1 B
5 C:ID:07b2107875 4 A
6 C:ID:34521fce1b 1 A
7 C:ID:5d09feb8b0 1 A
8 C:ID:a12f57ce14 3 A
9 C:ID:88fb23a843 1 A
10 C:ID:8ef5e8c067 1 B
5
分列同时转长表 - separate_rows
#这里直接对card_id列进行了根据"_"字符的分割后,直接以原列为“key”进行了长表转换,省去了单纯长表转换separate函数的新列命名过程。
> category %>%
separate_rows(card_id, sep = "_") #分列&转长表
# A tibble: 30 x 3
card_id category_2 category_3
<chr> <int> <chr>
1 C 3 A
2 ID 3 A
3 cb34e13a1e 3 A
4 C 3 A
5 ID 3 A
6 8a1184a21e 3 A
7 C 4 NA
8 ID 4 NA
9 69ed7c9177 4 NA
10 C 1 B
# ... with 20 more rows
5
缺失值处理
这里引入tidyr包中的三个函数(刚才插入的NA值是为了这里的):
#删除带有NA的行
> category %>%
drop_na()
# A tibble: 9 x 3
card_id category_2 category_3
<chr> <int> <chr>
1 C_ID_cb34e13a1e 3 A
2 C_ID_8a1184a21e 3 A
3 C_ID_ae6f8dd403 1 B
4 C_ID_07b2107875 4 A
5 C_ID_34521fce1b 1 A
6 C_ID_5d09feb8b0 1 A
7 C_ID_a12f57ce14 3 A
8 C_ID_88fb23a843 1 A
9 C_ID_8ef5e8c067 1 B
#使用相邻的值替换NA
> category %>%
fill(category_3)
# A tibble: 10 x 3
card_id category_2 category_3
<chr> <int> <chr>
1 C_ID_cb34e13a1e 3 A
2 C_ID_8a1184a21e 3 A
3 C_ID_69ed7c9177 4 A
4 C_ID_ae6f8dd403 1 B
5 C_ID_07b2107875 4 A
6 C_ID_34521fce1b 1 A
7 C_ID_5d09feb8b0 1 A
8 C_ID_a12f57ce14 3 A
9 C_ID_88fb23a843 1 A
10 C_ID_8ef5e8c067 1 B
#指定值替换NA
> category %>%
replace_na(list(category_3 = "X"))
# A tibble: 10 x 3
card_id category_2 category_3
<chr> <int> <chr>
1 C_ID_cb34e13a1e 3 A
2 C_ID_8a1184a21e 3 A
3 C_ID_69ed7c9177 4 X
4 C_ID_ae6f8dd403 1 B
5 C_ID_07b2107875 4 A
6 C_ID_34521fce1b 1 A
7 C_ID_5d09feb8b0 1 A
8 C_ID_a12f57ce14 3 A
9 C_ID_88fb23a843 1 A
10 C_ID_8ef5e8c067 1 B
purrr
#数据准备 - 我们只取data中的数值变量
>data %>%
select_if(is.numeric) -> data
> data
# A tibble: 185,129 x 8
city_id installments merchant_category_id month_lag purchase_amount category_2 state_id subsector_id
<int> <int> <int> <int> <dbl> <int> <int> <int>
1 20 0 422 1 -0.660 3 19 27
2 251 0 278 2 -0.624 3 8 37
3 117 0 367 2 -0.714 4 13 16
4 69 1 383 2 -0.446 1 9 2
5 277 0 278 1 -0.736 4 13 37
6 291 0 422 2 -0.702 1 9 27
7 69 0 511 2 -0.183 1 9 7
8 331 0 273 1 -0.720 3 3 20
9 69 0 19 1 -0.726 1 9 36
10 261 1 823 2 -0.509 1 9 25
# ... with 185,119 more rows
1
对数据集的所有列循环 - map_df
#求所有列的均值
> data %>%
map_df(mean)
# A tibble: 1 x 8
city_id installments merchant_category_id month_lag purchase_amount category_2 state_id subsector_id
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 142. 0.627 426. 1.48 -0.562 2.20 11.6 26.2
#全变量max-min标准化
> data %>%
map_df(function(x) ((x - min(x))/(max(x)-min(x))))
# A tibble: 185,129 x 8
city_id installments merchant_category_id month_lag purchase_amount category_2 state_id subsector_id
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0.0549 0.0769 0.472 0 0.00238 0.5 0.783 0.65
2 0.723 0.0769 0.310 1 0.00336 0.5 0.304 0.9
3 0.335 0.0769 0.411 1 0.000893 0.75 0.522 0.375
4 0.197 0.154 0.429 1 0.00820 0 0.348 0.025
5 0.798 0.0769 0.310 0 0.000293 0.75 0.522 0.9
6 0.838 0.0769 0.472 1 0.00123 0 0.348 0.65
7 0.197 0.0769 0.573 1 0.0154 0 0.348 0.15
8 0.954 0.0769 0.305 0 0.000737 0.5 0.0870 0.475
9 0.197 0.0769 0.0191 0 0.000582 0 0.348 0.875
10 0.751 0.154 0.924 1 0.00649 0 0.348 0.6
# ... with 185,119 more rows
#全变量Z-score归一化
> data %>%
map_df(scale)
# A tibble: 185,129 x 8
city_id installments merchant_category_id month_lag purchase_amount category_2 state_id subsector_id
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 -1.23 -0.571 -0.0156 -0.952 -0.166 0.521 1.36 0.0798
2 1.10 -0.571 -0.604 1.05 -0.105 0.521 -0.662 1.08
3 -0.251 -0.571 -0.240 1.05 -0.258 1.17 0.255 -1.02
4 -0.733 0.339 -0.175 1.05 0.195 -0.786 -0.479 -2.41
5 1.36 -0.571 -0.604 -0.952 -0.296 1.17 0.255 1.08
6 1.50 -0.571 -0.0156 1.05 -0.238 -0.786 -0.479 0.0798
7 -0.733 -0.571 0.348 1.05 0.641 -0.786 -0.479 -1.92
8 1.90 -0.571 -0.624 -0.952 -0.268 0.521 -1.58 -0.619
9 -0.733 -0.571 -1.66 -0.952 -0.278 -0.786 -0.479 0.978
10 1.20 0.339 1.62 1.05 0.0894 -0.786 -0.479 -0.120
# ... with 185,119 more rows
#全变量log处理
> data %>%
map_df(function(x) log(x + 1))
# A tibble: 185,129 x 8
city_id installments merchant_category_id month_lag purchase_amount category_2 state_id subsector_id
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 3.04 0 6.05 0.693 -1.08 1.39 3.00 3.33
2 5.53 0 5.63 1.10 -0.977 1.39 2.20 3.64
3 4.77 0 5.91 1.10 -1.25 1.61 2.64 2.83
4 4.25 0.693 5.95 1.10 -0.591 0.693 2.30 1.10
5 5.63 0 5.63 0.693 -1.33 1.61 2.64 3.64
6 5.68 0 6.05 1.10 -1.21 0.693 2.30 3.33
7 4.25 0 6.24 1.10 -0.203 0.693 2.30 2.08
8 5.81 0 5.61 0.693 -1.27 1.39 1.39 3.04
9 4.25 0 3.00 0.693 -1.29 0.693 2.30 3.61
10 5.57 0.693 6.71 1.10 -0.711 0.693 2.30 3.26
# ... with 185,119 more rows
2
指定数据集中的列做循环 - modify_at
#选择对installments列做log处理
> data %>%
modify_at(c("installments"), function(x) log(x + 1))
# A tibble: 185,129 x 8
city_id installments merchant_category_id month_lag purchase_amount category_2 state_id subsector_id
<int> <dbl> <int> <int> <dbl> <int> <int> <int>
1 20 0 422 1 -0.660 3 19 27
2 251 0 278 2 -0.624 3 8 37
3 117 0 367 2 -0.714 4 13 16
4 69 0.693 383 2 -0.446 1 9 2
5 277 0 278 1 -0.736 4 13 37
6 291 0 422 2 -0.702 1 9 27
7 69 0 511 2 -0.183 1 9 7
8 331 0 273 1 -0.720 3 3 20
9 69 0 19 1 -0.726 1 9 36
10 261 0.693 823 2 -0.509 1 9 25
# ... with 185,119 more rows
#选择对installments列做max-min标准化
> data %>%
modify_at(c("installments"), function(x) ((x - min(x))/(max(x) - min(x))))
# A tibble: 185,129 x 8
city_id installments merchant_category_id month_lag purchase_amount category_2 state_id subsector_id
<int> <dbl> <int> <int> <dbl> <int> <int> <int>
1 20 0.0769 422 1 -0.660 3 19 27
2 251 0.0769 278 2 -0.624 3 8 37
3 117 0.0769 367 2 -0.714 4 13 16
4 69 0.154 383 2 -0.446 1 9 2
5 277 0.0769 278 1 -0.736 4 13 37
6 291 0.0769 422 2 -0.702 1 9 27
7 69 0.0769 511 2 -0.183 1 9 7
8 331 0.0769 273 1 -0.720 3 3 20
9 69 0.0769 19 1 -0.726 1 9 36
10 261 0.154 823 2 -0.509 1 9 25
# ... with 185,119 more rows
#选择对installments做列Z-score归一化
> data %>%
modify_at(c("installments"), scale)
# A tibble: 185,129 x 8
city_id installments[,1] merchant_category_id month_lag purchase_amount category_2 state_id subsector_id
<int> <dbl> <int> <int> <dbl> <int> <int> <int>
1 20 -0.571 422 1 -0.660 3 19 27
2 251 -0.571 278 2 -0.624 3 8 37
3 117 -0.571 367 2 -0.714 4 13 16
4 69 0.339 383 2 -0.446 1 9 2
5 277 -0.571 278 1 -0.736 4 13 37
6 291 -0.571 422 2 -0.702 1 9 27
7 69 -0.571 511 2 -0.183 1 9 7
8 331 -0.571 273 1 -0.720 3 3 20
9 69 -0.571 19 1 -0.726 1 9 36
10 261 0.339 823 2 -0.509 1 9 25
# ... with 185,119 more rows
3
指定条件保留or剔除某列
#保留列均值大于10的列
> data %>%
keep(function(x) mean(x) > 10)
# A tibble: 185,129 x 4
city_id merchant_category_id state_id subsector_id
<int> <int> <int> <int>
1 20 422 19 27
2 251 278 8 37
3 117 367 13 16
4 69 383 9 2
5 277 278 13 37
6 291 422 9 27
7 69 511 9 7
8 331 273 3 20
9 69 19 9 36
10 261 823 9 25
# ... with 185,119 more rows
#剔除列均值大于10的列
> data %>%
discard(function(x) mean(x) > 10) #剔除列均值大于10的列
# A tibble: 185,129 x 4
installments month_lag purchase_amount category_2
<int> <int> <dbl> <int>
1 0 1 -0.660 3
2 0 2 -0.624 3
3 0 2 -0.714 4
4 1 2 -0.446 1
5 0 1 -0.736 4
6 0 2 -0.702 1
7 0 2 -0.183 1
8 0 1 -0.720 3
9 0 1 -0.726 1
10 1 2 -0.509 1
# ... with 185,119 more rows
最后
相信写到这里大家已经基本理解了tidyr&purrr包的功能与用法,数据清洗作为特征工程中重要的一环,熟练掌握数据表的重塑与清洗方法才能为数据分析提供最基本的支撑。
注:purrr包本文只介绍了对于数据框为对象的函数功能,此R包其实着重于处理list类型的数据,有兴趣的同学可以学习一下。
·END·
R语言与数据分析
生产力干货