前言
这篇文章中,我们汇总了stringr包对数据框内的字符串的各种操作,其中包括:存在检测;字符提取;更改替换;分列&合并。
stringr官方文档多是以向量对象为参考。而【Tidyverse For Dataframe】系列文章中:
原则上,我们希望做到所有操作的结果全部依赖data.frame对象,不溢出产生其他类型。
固,这篇文章内使用dplyr语句对stringr函数做嵌套,使后者的结果直接赋值到dataframe对象中,方便读者在实际生产中直接进行表操作。
dplyr函数操作详见:【R】dplyr - 数据处理瑞士军刀
管道操作符详见:【R】magrittr - 4种管道操作符活用教程
数据准备
pacman::p_load(data.table, tidyverse) #也可以直接library
data <- fread("D://contest//transactions.csv")
#将数据框转换为tibble形式数据框,为dplyr函数加快处理速度
#提取card_id, merchant_id两列,重新赋值为data
data <- data %>%
tbl_df() %>%
select(card_id, merchant_id)
> data
# A tibble: 185,129 x 2
card_id merchant_id
<chr> <chr>
1 C_ID_cb34e13a1e M_ID_f162748793
2 C_ID_8a1184a21e M_ID_00c57eaac8
3 C_ID_69ed7c9177 M_ID_1da88fbd75
4 C_ID_ae6f8dd403 M_ID_ebbdb42da6
5 C_ID_07b2107875 M_ID_7291e8adc1
6 C_ID_34521fce1b M_ID_0649e6e556
7 C_ID_5d09feb8b0 M_ID_b794b9d9e8
8 C_ID_a12f57ce14 M_ID_73d19e5b76
9 C_ID_88fb23a843 M_ID_a79f97e414
10 C_ID_8ef5e8c067 M_ID_20599284b1
# ... with 185,119 more rows
我们以一个kaggle上的信用卡消费数据集为例
百度网盘下载:
链接:https://pan.baidu.com/s/1Qv3nAJxfo7hxjdTOGoLOXA
提取码:f4ks
存在检测
1
筛选包含 - str_detect
#筛选card_id列包含“a2”字符的行
data %>%
filter(str_detect(card_id, "a2"))
# A tibble: 6,484 x 2
card_id merchant_id
<chr> <chr>
1 C_ID_8a1184a21e M_ID_00c57eaac8
2 C_ID_9b46dc1aa2 M_ID_b3372d49d4
3 C_ID_17a220c9f8 M_ID_e8b5c4cefd
4 C_ID_e1ae6b02a2 M_ID_11199ebd64
5 C_ID_ca2e8daf18 M_ID_de95a7ddcb
6 C_ID_667849a27d M_ID_ad2f6b1db9
7 C_ID_e13280dca2 M_ID_a870db5f63
8 C_ID_d2a2842aa6 M_ID_54aa7a0ad8
9 C_ID_36988f4aa2 M_ID_fa90130a67
10 C_ID_de0a294c48 M_ID_7f48fe6e0d
# ... with 6,474 more rows
2
出现次数 - str_count
#统计card_id列中每行“a”字符出现的次数,赋值到新列a_count中
data %>%
mutate(a_count = str_count(card_id, "a"))
# A tibble: 185,129 x 3
card_id merchant_id a_count
<chr> <chr> <int>
1 C_ID_cb34e13a1e M_ID_f162748793 1
2 C_ID_8a1184a21e M_ID_00c57eaac8 2
3 C_ID_69ed7c9177 M_ID_1da88fbd75 0
4 C_ID_ae6f8dd403 M_ID_ebbdb42da6 1
5 C_ID_07b2107875 M_ID_7291e8adc1 0
6 C_ID_34521fce1b M_ID_0649e6e556 0
7 C_ID_5d09feb8b0 M_ID_b794b9d9e8 0
8 C_ID_a12f57ce14 M_ID_73d19e5b76 1
9 C_ID_88fb23a843 M_ID_a79f97e414 1
10 C_ID_8ef5e8c067 M_ID_20599284b1 0
# ... with 185,119 more rows
3
提取存在 - str_match
#包含a1则显示a1,否则显示NA,赋值到新列a_count中
data %>%
mutate(ID = str_match(card_id, "a1"))
# A tibble: 185,129 x 3
card_id merchant_id ID[,1]
<chr> <chr> <chr>
1 C_ID_cb34e13a1e M_ID_f162748793 a1
2 C_ID_8a1184a21e M_ID_00c57eaac8 a1
3 C_ID_69ed7c9177 M_ID_1da88fbd75 NA
4 C_ID_ae6f8dd403 M_ID_ebbdb42da6 NA
5 C_ID_07b2107875 M_ID_7291e8adc1 NA
6 C_ID_34521fce1b M_ID_0649e6e556 NA
7 C_ID_5d09feb8b0 M_ID_b794b9d9e8 NA
8 C_ID_a12f57ce14 M_ID_73d19e5b76 a1
9 C_ID_88fb23a843 M_ID_a79f97e414 NA
10 C_ID_8ef5e8c067 M_ID_20599284b1 NA
# ... with 185,119 more rows
4
存在示位 - str_locate
#查看字符a1在card_id列中的存在与否,和起始、结束位置;起始位置赋值到start列,结束位置赋值到end列
data %>%
mutate(start = str_locate(card_id, "a1")[, 1]) %>%
mutate(end = str_locate(card_id, "a1")[, 2])
# A tibble: 185,129 x 4
card_id merchant_id start end
<chr> <chr> <int> <int>
1 C_ID_cb34e13a1e M_ID_f162748793 13 14
2 C_ID_8a1184a21e M_ID_00c57eaac8 7 8
3 C_ID_69ed7c9177 M_ID_1da88fbd75 NA NA
4 C_ID_ae6f8dd403 M_ID_ebbdb42da6 NA NA
5 C_ID_07b2107875 M_ID_7291e8adc1 NA NA
6 C_ID_34521fce1b M_ID_0649e6e556 NA NA
7 C_ID_5d09feb8b0 M_ID_b794b9d9e8 NA NA
8 C_ID_a12f57ce14 M_ID_73d19e5b76 6 7
9 C_ID_88fb23a843 M_ID_a79f97e414 NA NA
10 C_ID_8ef5e8c067 M_ID_20599284b1 NA NA
# ... with 185,119 more rows
5
长度计算 - str_length
##查看card_id列中每行字符串对应的长度,赋值到lenth列
data %>%
mutate(lenth = str_length(card_id))
# A tibble: 185,129 x 3
card_id merchant_id lenth
<chr> <chr> <int>
1 C_ID_cb34e13a1e M_ID_f162748793 15
2 C_ID_8a1184a21e M_ID_00c57eaac8 15
3 C_ID_69ed7c9177 M_ID_1da88fbd75 15
4 C_ID_ae6f8dd403 M_ID_ebbdb42da6 15
5 C_ID_07b2107875 M_ID_7291e8adc1 15
6 C_ID_34521fce1b M_ID_0649e6e556 15
7 C_ID_5d09feb8b0 M_ID_b794b9d9e8 15
8 C_ID_a12f57ce14 M_ID_73d19e5b76 15
9 C_ID_88fb23a843 M_ID_a79f97e414 15
10 C_ID_8ef5e8c067 M_ID_20599284b1 15
# ... with 185,119 more rows
字符提取
1
位置截取 - str_sub
#截取card_id列1-6位字符,赋值到新建列first_id
data %>%
mutate(first_id = str_sub(card_id, 1, 6))
# A tibble: 185,129 x 3
card_id merchant_id first_id
<chr> <chr> <chr>
1 C_ID_cb34e13a1e M_ID_f162748793 C_ID_c
2 C_ID_8a1184a21e M_ID_00c57eaac8 C_ID_8
3 C_ID_69ed7c9177 M_ID_1da88fbd75 C_ID_6
4 C_ID_ae6f8dd403 M_ID_ebbdb42da6 C_ID_a
5 C_ID_07b2107875 M_ID_7291e8adc1 C_ID_0
6 C_ID_34521fce1b M_ID_0649e6e556 C_ID_3
7 C_ID_5d09feb8b0 M_ID_b794b9d9e8 C_ID_5
8 C_ID_a12f57ce14 M_ID_73d19e5b76 C_ID_a
9 C_ID_88fb23a843 M_ID_a79f97e414 C_ID_8
10 C_ID_8ef5e8c067 M_ID_20599284b1 C_ID_8
# ... with 185,119 more rows
#可以指设置初始位置或结束位置
data %>%
mutate(first_id = str_sub(card_id, 6)) #从第6个字符开始截取,包含第6个字符
# A tibble: 185,129 x 3
card_id merchant_id first_id
<chr> <chr> <chr>
1 C_ID_cb34e13a1e M_ID_f162748793 cb34e13a1e
2 C_ID_8a1184a21e M_ID_00c57eaac8 8a1184a21e
3 C_ID_69ed7c9177 M_ID_1da88fbd75 69ed7c9177
4 C_ID_ae6f8dd403 M_ID_ebbdb42da6 ae6f8dd403
5 C_ID_07b2107875 M_ID_7291e8adc1 07b2107875
6 C_ID_34521fce1b M_ID_0649e6e556 34521fce1b
7 C_ID_5d09feb8b0 M_ID_b794b9d9e8 5d09feb8b0
8 C_ID_a12f57ce14 M_ID_73d19e5b76 a12f57ce14
9 C_ID_88fb23a843 M_ID_a79f97e414 88fb23a843
10 C_ID_8ef5e8c067 M_ID_20599284b1 8ef5e8c067
# ... with 185,119 more rows
#也可以设置成负数,表示从倒数第几个字符开始截取
data %>%
mutate(first_id = str_sub(card_id, -5))
# A tibble: 185,129 x 3
card_id merchant_id first_id
<chr> <chr> <chr>
1 C_ID_cb34e13a1e M_ID_f162748793 13a1e
2 C_ID_8a1184a21e M_ID_00c57eaac8 4a21e
3 C_ID_69ed7c9177 M_ID_1da88fbd75 c9177
4 C_ID_ae6f8dd403 M_ID_ebbdb42da6 dd403
5 C_ID_07b2107875 M_ID_7291e8adc1 07875
6 C_ID_34521fce1b M_ID_0649e6e556 fce1b
7 C_ID_5d09feb8b0 M_ID_b794b9d9e8 eb8b0
8 C_ID_a12f57ce14 M_ID_73d19e5b76 7ce14
9 C_ID_88fb23a843 M_ID_a79f97e414 3a843
10 C_ID_8ef5e8c067 M_ID_20599284b1 8c067
# ... with 185,119 more rows
2
首配提取 - str_extract
#使用正则表达式,提取第一个出现的数字
data %>%
mutate(number = str_extract(card_id, '[0-9]'))
# A tibble: 185,129 x 3
card_id merchant_id number
<chr> <chr> <chr>
1 C_ID_cb34e13a1e M_ID_f162748793 3
2 C_ID_8a1184a21e M_ID_00c57eaac8 8
3 C_ID_69ed7c9177 M_ID_1da88fbd75 6
4 C_ID_ae6f8dd403 M_ID_ebbdb42da6 6
5 C_ID_07b2107875 M_ID_7291e8adc1 0
6 C_ID_34521fce1b M_ID_0649e6e556 3
7 C_ID_5d09feb8b0 M_ID_b794b9d9e8 5
8 C_ID_a12f57ce14 M_ID_73d19e5b76 1
9 C_ID_88fb23a843 M_ID_a79f97e414 8
10 C_ID_8ef5e8c067 M_ID_20599284b1 8
# ... with 185,119 more rows
更改替换
1
首配替换 - str_replace
#将card_id列中第一个C_ID字符替换为card,赋值到新建列ID中
data %>%
mutate(ID = str_replace(card_id, "C_ID", "card"))
# A tibble: 185,129 x 3
card_id merchant_id ID
<chr> <chr> <chr>
1 C_ID_cb34e13a1e M_ID_f162748793 card_cb34e13a1e
2 C_ID_8a1184a21e M_ID_00c57eaac8 card_8a1184a21e
3 C_ID_69ed7c9177 M_ID_1da88fbd75 card_69ed7c9177
4 C_ID_ae6f8dd403 M_ID_ebbdb42da6 card_ae6f8dd403
5 C_ID_07b2107875 M_ID_7291e8adc1 card_07b2107875
6 C_ID_34521fce1b M_ID_0649e6e556 card_34521fce1b
7 C_ID_5d09feb8b0 M_ID_b794b9d9e8 card_5d09feb8b0
8 C_ID_a12f57ce14 M_ID_73d19e5b76 card_a12f57ce14
9 C_ID_88fb23a843 M_ID_a79f97e414 card_88fb23a843
10 C_ID_8ef5e8c067 M_ID_20599284b1 card_8ef5e8c067
# ... with 185,119 more rows
2
全部替换 - str_replace_all
#将card_id列中每行所有的a字符替换为Z,赋值到新建列ID中
data %>%
mutate(ID = str_replace_all(card_id, "a", "Z"))
# A tibble: 185,129 x 3
card_id merchant_id ID
<chr> <chr> <chr>
1 C_ID_cb34e13a1e M_ID_f162748793 C_ID_cb34e13Z1e
2 C_ID_8a1184a21e M_ID_00c57eaac8 C_ID_8Z1184Z21e
3 C_ID_69ed7c9177 M_ID_1da88fbd75 C_ID_69ed7c9177
4 C_ID_ae6f8dd403 M_ID_ebbdb42da6 C_ID_Ze6f8dd403
5 C_ID_07b2107875 M_ID_7291e8adc1 C_ID_07b2107875
6 C_ID_34521fce1b M_ID_0649e6e556 C_ID_34521fce1b
7 C_ID_5d09feb8b0 M_ID_b794b9d9e8 C_ID_5d09feb8b0
8 C_ID_a12f57ce14 M_ID_73d19e5b76 C_ID_Z12f57ce14
9 C_ID_88fb23a843 M_ID_a79f97e414 C_ID_88fb23Z843
10 C_ID_8ef5e8c067 M_ID_20599284b1 C_ID_8ef5e8c067
# ... with 185,119 more rows
3
全部大写 - str_to_upper
#将card_id列转换为大写,赋值到新建列ID中
data %>%
mutate(ID = str_to_upper(card_id))
# A tibble: 185,129 x 3
card_id merchant_id ID
<chr> <chr> <chr>
1 C_ID_cb34e13a1e M_ID_f162748793 C_ID_CB34E13A1E
2 C_ID_8a1184a21e M_ID_00c57eaac8 C_ID_8A1184A21E
3 C_ID_69ed7c9177 M_ID_1da88fbd75 C_ID_69ED7C9177
4 C_ID_ae6f8dd403 M_ID_ebbdb42da6 C_ID_AE6F8DD403
5 C_ID_07b2107875 M_ID_7291e8adc1 C_ID_07B2107875
6 C_ID_34521fce1b M_ID_0649e6e556 C_ID_34521FCE1B
7 C_ID_5d09feb8b0 M_ID_b794b9d9e8 C_ID_5D09FEB8B0
8 C_ID_a12f57ce14 M_ID_73d19e5b76 C_ID_A12F57CE14
9 C_ID_88fb23a843 M_ID_a79f97e414 C_ID_88FB23A843
10 C_ID_8ef5e8c067 M_ID_20599284b1 C_ID_8EF5E8C067
# ... with 185,119 more rows
4
全部小写 - str_to_lower
#将card_id列转换为小写,赋值到新建列ID中
data %>%
mutate(ID = str_to_lower(card_id))
# A tibble: 185,129 x 3
card_id merchant_id ID
<chr> <chr> <chr>
1 C_ID_cb34e13a1e M_ID_f162748793 c_id_cb34e13a1e
2 C_ID_8a1184a21e M_ID_00c57eaac8 c_id_8a1184a21e
3 C_ID_69ed7c9177 M_ID_1da88fbd75 c_id_69ed7c9177
4 C_ID_ae6f8dd403 M_ID_ebbdb42da6 c_id_ae6f8dd403
5 C_ID_07b2107875 M_ID_7291e8adc1 c_id_07b2107875
6 C_ID_34521fce1b M_ID_0649e6e556 c_id_34521fce1b
7 C_ID_5d09feb8b0 M_ID_b794b9d9e8 c_id_5d09feb8b0
8 C_ID_a12f57ce14 M_ID_73d19e5b76 c_id_a12f57ce14
9 C_ID_88fb23a843 M_ID_a79f97e414 c_id_88fb23a843
10 C_ID_8ef5e8c067 M_ID_20599284b1 c_id_8ef5e8c067
# ... with 185,119 more rows
5
首字大写 - str_to_title
#将card_id列转换为单词首字母大写(以空格为区分),赋值到新建列ID中
data %>%
mutate(ID = str_to_title(card_id))
# A tibble: 185,129 x 3
card_id merchant_id ID
<chr> <chr> <chr>
1 C_ID_cb34e13a1e M_ID_f162748793 C_id_cb34e13a1e
2 C_ID_8a1184a21e M_ID_00c57eaac8 C_id_8a1184a21e
3 C_ID_69ed7c9177 M_ID_1da88fbd75 C_id_69ed7c9177
4 C_ID_ae6f8dd403 M_ID_ebbdb42da6 C_id_ae6f8dd403
5 C_ID_07b2107875 M_ID_7291e8adc1 C_id_07b2107875
6 C_ID_34521fce1b M_ID_0649e6e556 C_id_34521fce1b
7 C_ID_5d09feb8b0 M_ID_b794b9d9e8 C_id_5d09feb8b0
8 C_ID_a12f57ce14 M_ID_73d19e5b76 C_id_a12f57ce14
9 C_ID_88fb23a843 M_ID_a79f97e414 C_id_88fb23a843
10 C_ID_8ef5e8c067 M_ID_20599284b1 C_id_8ef5e8c067
# ... with 185,119 more rows
分列&合并
1
变量分列 - separate(tidyr包)
#将card_id列以"_"字符分列为"noneed", "noneed2", "id"三个列,新数据框赋值到category_sep
data %>%
separate(card_id, sep = "_", into = c("noneed", "noneed2", "id")) -> category_sep
> category_sep
# A tibble: 185,129 x 4
noneed noneed2 id merchant_id
<chr> <chr> <chr> <chr>
1 C ID cb34e13a1e M_ID_f162748793
2 C ID 8a1184a21e M_ID_00c57eaac8
3 C ID 69ed7c9177 M_ID_1da88fbd75
4 C ID ae6f8dd403 M_ID_ebbdb42da6
5 C ID 07b2107875 M_ID_7291e8adc1
6 C ID 34521fce1b M_ID_0649e6e556
7 C ID 5d09feb8b0 M_ID_b794b9d9e8
8 C ID a12f57ce14 M_ID_73d19e5b76
9 C ID 88fb23a843 M_ID_a79f97e414
10 C ID 8ef5e8c067 M_ID_20599284b1
# ... with 185,119 more rows
2
变量合并 - unite(tidyr包)
#将"noneed", "noneed2", "id"三个列重新合并为ID单列,连接符为":"
category_sep %>%
unite(noneed, noneed2, id, col = "ID", sep = ":")
# A tibble: 185,129 x 2
ID merchant_id
<chr> <chr>
1 C:ID:cb34e13a1e M_ID_f162748793
2 C:ID:8a1184a21e M_ID_00c57eaac8
3 C:ID:69ed7c9177 M_ID_1da88fbd75
4 C:ID:ae6f8dd403 M_ID_ebbdb42da6
5 C:ID:07b2107875 M_ID_7291e8adc1
6 C:ID:34521fce1b M_ID_0649e6e556
7 C:ID:5d09feb8b0 M_ID_b794b9d9e8
8 C:ID:a12f57ce14 M_ID_73d19e5b76
9 C:ID:88fb23a843 M_ID_a79f97e414
10 C:ID:8ef5e8c067 M_ID_20599284b1
# ... with 185,119 more rows
最后
相信写到这里大家已经基本理解了stringr包的功能与用法,数据清洗作为特征工程中重要的一环,熟练掌握数据表内的字符串的增删查改功能才能为数据分析提供最基本的支撑。
·END·
R语言与数据分析
生产力干货