【R】stringr - 玩转字符串 by 数据框

ad82ac62fee284753751e65523f23a5f.jpeg

前言

    这篇文章中,我们汇总了stringr包对数据框内的字符串的各种操作,其中包括:存在检测;字符提取;更改替换;分列&合并。

    stringr官方文档多是以向量对象为参考。而【Tidyverse For Dataframe】系列文章中:

    原则上,我们希望做到所有操作的结果全部依赖data.frame对象,不溢出产生其他类型。

    固,这篇文章内使用dplyr语句对stringr函数做嵌套,使后者的结果直接赋值到dataframe对象中,方便读者在实际生产中直接进行表操作。

    dplyr函数操作详见:【R】dplyr - 数据处理瑞士军刀

    管道操作符详见:【R】magrittr - 4种管道操作符活用教程

数据准备

pacman::p_load(data.table, tidyverse) #也可以直接library
data <- fread("D://contest//transactions.csv")


#将数据框转换为tibble形式数据框,为dplyr函数加快处理速度
#提取card_id, merchant_id两列,重新赋值为data
data <- data %>% 
  tbl_df() %>% 
  select(card_id, merchant_id)


> data


# A tibble: 185,129 x 2
   card_id         merchant_id    
   <chr>           <chr>          
 1 C_ID_cb34e13a1e M_ID_f162748793
 2 C_ID_8a1184a21e M_ID_00c57eaac8
 3 C_ID_69ed7c9177 M_ID_1da88fbd75
 4 C_ID_ae6f8dd403 M_ID_ebbdb42da6
 5 C_ID_07b2107875 M_ID_7291e8adc1
 6 C_ID_34521fce1b M_ID_0649e6e556
 7 C_ID_5d09feb8b0 M_ID_b794b9d9e8
 8 C_ID_a12f57ce14 M_ID_73d19e5b76
 9 C_ID_88fb23a843 M_ID_a79f97e414
10 C_ID_8ef5e8c067 M_ID_20599284b1
# ... with 185,119 more rows

我们以一个kaggle上的信用卡消费数据集为例

百度网盘下载:

链接:https://pan.baidu.com/s/1Qv3nAJxfo7hxjdTOGoLOXA

提取码:f4ks

存在检测

1

筛选包含 - str_detect

7be4ef3cf3f95ebfcabd4bc169ff4429.png

#筛选card_id列包含“a2”字符的行

data %>% 
  filter(str_detect(card_id, "a2"))
  
  # A tibble: 6,484 x 2
   card_id         merchant_id    
   <chr>           <chr>          
 1 C_ID_8a1184a21e M_ID_00c57eaac8
 2 C_ID_9b46dc1aa2 M_ID_b3372d49d4
 3 C_ID_17a220c9f8 M_ID_e8b5c4cefd
 4 C_ID_e1ae6b02a2 M_ID_11199ebd64
 5 C_ID_ca2e8daf18 M_ID_de95a7ddcb
 6 C_ID_667849a27d M_ID_ad2f6b1db9
 7 C_ID_e13280dca2 M_ID_a870db5f63
 8 C_ID_d2a2842aa6 M_ID_54aa7a0ad8
 9 C_ID_36988f4aa2 M_ID_fa90130a67
10 C_ID_de0a294c48 M_ID_7f48fe6e0d
# ... with 6,474 more rows

2

出现次数 - str_count

ded45a9486fbbd1064cde461a49d1cd6.png

#统计card_id列中每行“a”字符出现的次数,赋值到新列a_count中

data %>% 
  mutate(a_count = str_count(card_id, "a"))
  
  # A tibble: 185,129 x 3
   card_id         merchant_id     a_count
   <chr>           <chr>             <int>
 1 C_ID_cb34e13a1e M_ID_f162748793       1
 2 C_ID_8a1184a21e M_ID_00c57eaac8       2
 3 C_ID_69ed7c9177 M_ID_1da88fbd75       0
 4 C_ID_ae6f8dd403 M_ID_ebbdb42da6       1
 5 C_ID_07b2107875 M_ID_7291e8adc1       0
 6 C_ID_34521fce1b M_ID_0649e6e556       0
 7 C_ID_5d09feb8b0 M_ID_b794b9d9e8       0
 8 C_ID_a12f57ce14 M_ID_73d19e5b76       1
 9 C_ID_88fb23a843 M_ID_a79f97e414       1
10 C_ID_8ef5e8c067 M_ID_20599284b1       0
# ... with 185,119 more rows

3

提取存在 - str_match

0ecef8d1b4079e8b66abd7c1229f51c4.png

#包含a1则显示a1,否则显示NA,赋值到新列a_count中

data %>% 
  mutate(ID = str_match(card_id, "a1"))
  
  # A tibble: 185,129 x 3
   card_id         merchant_id     ID[,1]
   <chr>           <chr>           <chr> 
 1 C_ID_cb34e13a1e M_ID_f162748793 a1    
 2 C_ID_8a1184a21e M_ID_00c57eaac8 a1    
 3 C_ID_69ed7c9177 M_ID_1da88fbd75 NA    
 4 C_ID_ae6f8dd403 M_ID_ebbdb42da6 NA    
 5 C_ID_07b2107875 M_ID_7291e8adc1 NA    
 6 C_ID_34521fce1b M_ID_0649e6e556 NA    
 7 C_ID_5d09feb8b0 M_ID_b794b9d9e8 NA    
 8 C_ID_a12f57ce14 M_ID_73d19e5b76 a1    
 9 C_ID_88fb23a843 M_ID_a79f97e414 NA    
10 C_ID_8ef5e8c067 M_ID_20599284b1 NA    
# ... with 185,119 more rows

4

存在示位 - str_locate

7c388937f29b14e7b624a6cee93c66c1.png

#查看字符a1在card_id列中的存在与否,和起始、结束位置;起始位置赋值到start列,结束位置赋值到end列

data %>% 
  mutate(start = str_locate(card_id, "a1")[, 1]) %>% 
  mutate(end = str_locate(card_id, "a1")[, 2])
  
  # A tibble: 185,129 x 4
   card_id         merchant_id     start   end
   <chr>           <chr>           <int> <int>
 1 C_ID_cb34e13a1e M_ID_f162748793    13    14
 2 C_ID_8a1184a21e M_ID_00c57eaac8     7     8
 3 C_ID_69ed7c9177 M_ID_1da88fbd75    NA    NA
 4 C_ID_ae6f8dd403 M_ID_ebbdb42da6    NA    NA
 5 C_ID_07b2107875 M_ID_7291e8adc1    NA    NA
 6 C_ID_34521fce1b M_ID_0649e6e556    NA    NA
 7 C_ID_5d09feb8b0 M_ID_b794b9d9e8    NA    NA
 8 C_ID_a12f57ce14 M_ID_73d19e5b76     6     7
 9 C_ID_88fb23a843 M_ID_a79f97e414    NA    NA
10 C_ID_8ef5e8c067 M_ID_20599284b1    NA    NA
# ... with 185,119 more rows

5

长度计算 - str_length

8622be0ca7d7b83e914f900bb4ff4ce1.png

##查看card_id列中每行字符串对应的长度,赋值到lenth列

data %>% 
  mutate(lenth = str_length(card_id))
  
  # A tibble: 185,129 x 3
   card_id         merchant_id     lenth
   <chr>           <chr>           <int>
 1 C_ID_cb34e13a1e M_ID_f162748793    15
 2 C_ID_8a1184a21e M_ID_00c57eaac8    15
 3 C_ID_69ed7c9177 M_ID_1da88fbd75    15
 4 C_ID_ae6f8dd403 M_ID_ebbdb42da6    15
 5 C_ID_07b2107875 M_ID_7291e8adc1    15
 6 C_ID_34521fce1b M_ID_0649e6e556    15
 7 C_ID_5d09feb8b0 M_ID_b794b9d9e8    15
 8 C_ID_a12f57ce14 M_ID_73d19e5b76    15
 9 C_ID_88fb23a843 M_ID_a79f97e414    15
10 C_ID_8ef5e8c067 M_ID_20599284b1    15
# ... with 185,119 more rows

字符提取

1

位置截取 - str_sub

6dd57332f5d1903a41f810f960274816.png

#截取card_id列1-6位字符,赋值到新建列first_id

data %>% 
  mutate(first_id = str_sub(card_id, 1, 6))
  
  # A tibble: 185,129 x 3
   card_id         merchant_id     first_id
   <chr>           <chr>           <chr>   
 1 C_ID_cb34e13a1e M_ID_f162748793 C_ID_c  
 2 C_ID_8a1184a21e M_ID_00c57eaac8 C_ID_8  
 3 C_ID_69ed7c9177 M_ID_1da88fbd75 C_ID_6  
 4 C_ID_ae6f8dd403 M_ID_ebbdb42da6 C_ID_a  
 5 C_ID_07b2107875 M_ID_7291e8adc1 C_ID_0  
 6 C_ID_34521fce1b M_ID_0649e6e556 C_ID_3  
 7 C_ID_5d09feb8b0 M_ID_b794b9d9e8 C_ID_5  
 8 C_ID_a12f57ce14 M_ID_73d19e5b76 C_ID_a  
 9 C_ID_88fb23a843 M_ID_a79f97e414 C_ID_8  
10 C_ID_8ef5e8c067 M_ID_20599284b1 C_ID_8  
# ... with 185,119 more rows

#可以指设置初始位置或结束位置

data %>% 
  mutate(first_id = str_sub(card_id, 6)) #从第6个字符开始截取,包含第6个字符
  
  # A tibble: 185,129 x 3
   card_id         merchant_id     first_id  
   <chr>           <chr>           <chr>     
 1 C_ID_cb34e13a1e M_ID_f162748793 cb34e13a1e
 2 C_ID_8a1184a21e M_ID_00c57eaac8 8a1184a21e
 3 C_ID_69ed7c9177 M_ID_1da88fbd75 69ed7c9177
 4 C_ID_ae6f8dd403 M_ID_ebbdb42da6 ae6f8dd403
 5 C_ID_07b2107875 M_ID_7291e8adc1 07b2107875
 6 C_ID_34521fce1b M_ID_0649e6e556 34521fce1b
 7 C_ID_5d09feb8b0 M_ID_b794b9d9e8 5d09feb8b0
 8 C_ID_a12f57ce14 M_ID_73d19e5b76 a12f57ce14
 9 C_ID_88fb23a843 M_ID_a79f97e414 88fb23a843
10 C_ID_8ef5e8c067 M_ID_20599284b1 8ef5e8c067
# ... with 185,119 more rows

#也可以设置成负数,表示从倒数第几个字符开始截取

data %>% 
  mutate(first_id = str_sub(card_id, -5))
  
  # A tibble: 185,129 x 3
   card_id         merchant_id     first_id
   <chr>           <chr>           <chr>   
 1 C_ID_cb34e13a1e M_ID_f162748793 13a1e   
 2 C_ID_8a1184a21e M_ID_00c57eaac8 4a21e   
 3 C_ID_69ed7c9177 M_ID_1da88fbd75 c9177   
 4 C_ID_ae6f8dd403 M_ID_ebbdb42da6 dd403   
 5 C_ID_07b2107875 M_ID_7291e8adc1 07875   
 6 C_ID_34521fce1b M_ID_0649e6e556 fce1b   
 7 C_ID_5d09feb8b0 M_ID_b794b9d9e8 eb8b0   
 8 C_ID_a12f57ce14 M_ID_73d19e5b76 7ce14   
 9 C_ID_88fb23a843 M_ID_a79f97e414 3a843   
10 C_ID_8ef5e8c067 M_ID_20599284b1 8c067   
# ... with 185,119 more rows

2

首配提取 - str_extract

5b8d0be28d45ec4a126a4c847571bd5a.png

#使用正则表达式,提取第一个出现的数字

data %>% 
  mutate(number = str_extract(card_id, '[0-9]'))
  
  # A tibble: 185,129 x 3
   card_id         merchant_id     number
   <chr>           <chr>           <chr> 
 1 C_ID_cb34e13a1e M_ID_f162748793 3     
 2 C_ID_8a1184a21e M_ID_00c57eaac8 8     
 3 C_ID_69ed7c9177 M_ID_1da88fbd75 6     
 4 C_ID_ae6f8dd403 M_ID_ebbdb42da6 6     
 5 C_ID_07b2107875 M_ID_7291e8adc1 0     
 6 C_ID_34521fce1b M_ID_0649e6e556 3     
 7 C_ID_5d09feb8b0 M_ID_b794b9d9e8 5     
 8 C_ID_a12f57ce14 M_ID_73d19e5b76 1     
 9 C_ID_88fb23a843 M_ID_a79f97e414 8     
10 C_ID_8ef5e8c067 M_ID_20599284b1 8     
# ... with 185,119 more rows

更改替换

1

首配替换 - str_replace

ea46ce72fea45ae477f2c4afab1571bb.png

#将card_id列中第一个C_ID字符替换为card,赋值到新建列ID中

data %>% 
  mutate(ID = str_replace(card_id, "C_ID", "card"))
  
  # A tibble: 185,129 x 3
   card_id         merchant_id     ID             
   <chr>           <chr>           <chr>          
 1 C_ID_cb34e13a1e M_ID_f162748793 card_cb34e13a1e
 2 C_ID_8a1184a21e M_ID_00c57eaac8 card_8a1184a21e
 3 C_ID_69ed7c9177 M_ID_1da88fbd75 card_69ed7c9177
 4 C_ID_ae6f8dd403 M_ID_ebbdb42da6 card_ae6f8dd403
 5 C_ID_07b2107875 M_ID_7291e8adc1 card_07b2107875
 6 C_ID_34521fce1b M_ID_0649e6e556 card_34521fce1b
 7 C_ID_5d09feb8b0 M_ID_b794b9d9e8 card_5d09feb8b0
 8 C_ID_a12f57ce14 M_ID_73d19e5b76 card_a12f57ce14
 9 C_ID_88fb23a843 M_ID_a79f97e414 card_88fb23a843
10 C_ID_8ef5e8c067 M_ID_20599284b1 card_8ef5e8c067
# ... with 185,119 more rows

2

全部替换 - str_replace_all

21910028824dcfd7fd73496460dfd55f.png

#将card_id列中每行所有的a字符替换为Z,赋值到新建列ID中

data %>% 
  mutate(ID = str_replace_all(card_id, "a", "Z"))
  
  # A tibble: 185,129 x 3
   card_id         merchant_id     ID             
   <chr>           <chr>           <chr>          
 1 C_ID_cb34e13a1e M_ID_f162748793 C_ID_cb34e13Z1e
 2 C_ID_8a1184a21e M_ID_00c57eaac8 C_ID_8Z1184Z21e
 3 C_ID_69ed7c9177 M_ID_1da88fbd75 C_ID_69ed7c9177
 4 C_ID_ae6f8dd403 M_ID_ebbdb42da6 C_ID_Ze6f8dd403
 5 C_ID_07b2107875 M_ID_7291e8adc1 C_ID_07b2107875
 6 C_ID_34521fce1b M_ID_0649e6e556 C_ID_34521fce1b
 7 C_ID_5d09feb8b0 M_ID_b794b9d9e8 C_ID_5d09feb8b0
 8 C_ID_a12f57ce14 M_ID_73d19e5b76 C_ID_Z12f57ce14
 9 C_ID_88fb23a843 M_ID_a79f97e414 C_ID_88fb23Z843
10 C_ID_8ef5e8c067 M_ID_20599284b1 C_ID_8ef5e8c067
# ... with 185,119 more rows

3

全部大写 - str_to_upper

30a38461506ff4bb20affa55f5c68472.png

#将card_id列转换为大写,赋值到新建列ID中

data %>% 
  mutate(ID = str_to_upper(card_id))
  
  # A tibble: 185,129 x 3
   card_id         merchant_id     ID             
   <chr>           <chr>           <chr>          
 1 C_ID_cb34e13a1e M_ID_f162748793 C_ID_CB34E13A1E
 2 C_ID_8a1184a21e M_ID_00c57eaac8 C_ID_8A1184A21E
 3 C_ID_69ed7c9177 M_ID_1da88fbd75 C_ID_69ED7C9177
 4 C_ID_ae6f8dd403 M_ID_ebbdb42da6 C_ID_AE6F8DD403
 5 C_ID_07b2107875 M_ID_7291e8adc1 C_ID_07B2107875
 6 C_ID_34521fce1b M_ID_0649e6e556 C_ID_34521FCE1B
 7 C_ID_5d09feb8b0 M_ID_b794b9d9e8 C_ID_5D09FEB8B0
 8 C_ID_a12f57ce14 M_ID_73d19e5b76 C_ID_A12F57CE14
 9 C_ID_88fb23a843 M_ID_a79f97e414 C_ID_88FB23A843
10 C_ID_8ef5e8c067 M_ID_20599284b1 C_ID_8EF5E8C067
# ... with 185,119 more rows

4

全部小写 - str_to_lower

04845a60c8fccd421180c00840206c93.png

#将card_id列转换为小写,赋值到新建列ID中

data %>% 
  mutate(ID = str_to_lower(card_id))
  
  # A tibble: 185,129 x 3
   card_id         merchant_id     ID             
   <chr>           <chr>           <chr>          
 1 C_ID_cb34e13a1e M_ID_f162748793 c_id_cb34e13a1e
 2 C_ID_8a1184a21e M_ID_00c57eaac8 c_id_8a1184a21e
 3 C_ID_69ed7c9177 M_ID_1da88fbd75 c_id_69ed7c9177
 4 C_ID_ae6f8dd403 M_ID_ebbdb42da6 c_id_ae6f8dd403
 5 C_ID_07b2107875 M_ID_7291e8adc1 c_id_07b2107875
 6 C_ID_34521fce1b M_ID_0649e6e556 c_id_34521fce1b
 7 C_ID_5d09feb8b0 M_ID_b794b9d9e8 c_id_5d09feb8b0
 8 C_ID_a12f57ce14 M_ID_73d19e5b76 c_id_a12f57ce14
 9 C_ID_88fb23a843 M_ID_a79f97e414 c_id_88fb23a843
10 C_ID_8ef5e8c067 M_ID_20599284b1 c_id_8ef5e8c067
# ... with 185,119 more rows

5

首字大写 - str_to_title

0ef5eddffa3701ee1fcca95234df6877.png

#将card_id列转换为单词首字母大写(以空格为区分),赋值到新建列ID中

data %>% 
  mutate(ID = str_to_title(card_id))
  
  # A tibble: 185,129 x 3
   card_id         merchant_id     ID             
   <chr>           <chr>           <chr>          
 1 C_ID_cb34e13a1e M_ID_f162748793 C_id_cb34e13a1e
 2 C_ID_8a1184a21e M_ID_00c57eaac8 C_id_8a1184a21e
 3 C_ID_69ed7c9177 M_ID_1da88fbd75 C_id_69ed7c9177
 4 C_ID_ae6f8dd403 M_ID_ebbdb42da6 C_id_ae6f8dd403
 5 C_ID_07b2107875 M_ID_7291e8adc1 C_id_07b2107875
 6 C_ID_34521fce1b M_ID_0649e6e556 C_id_34521fce1b
 7 C_ID_5d09feb8b0 M_ID_b794b9d9e8 C_id_5d09feb8b0
 8 C_ID_a12f57ce14 M_ID_73d19e5b76 C_id_a12f57ce14
 9 C_ID_88fb23a843 M_ID_a79f97e414 C_id_88fb23a843
10 C_ID_8ef5e8c067 M_ID_20599284b1 C_id_8ef5e8c067
# ... with 185,119 more rows

分列&合并

1

变量分列 - separate(tidyr包)

#将card_id列以"_"字符分列为"noneed", "noneed2", "id"三个列,新数据框赋值到category_sep

data %>% 
  separate(card_id, sep = "_", into = c("noneed", "noneed2", "id")) -> category_sep
  
  > category_sep
  
# A tibble: 185,129 x 4
   noneed noneed2 id         merchant_id    
   <chr>  <chr>   <chr>      <chr>          
 1 C      ID      cb34e13a1e M_ID_f162748793
 2 C      ID      8a1184a21e M_ID_00c57eaac8
 3 C      ID      69ed7c9177 M_ID_1da88fbd75
 4 C      ID      ae6f8dd403 M_ID_ebbdb42da6
 5 C      ID      07b2107875 M_ID_7291e8adc1
 6 C      ID      34521fce1b M_ID_0649e6e556
 7 C      ID      5d09feb8b0 M_ID_b794b9d9e8
 8 C      ID      a12f57ce14 M_ID_73d19e5b76
 9 C      ID      88fb23a843 M_ID_a79f97e414
10 C      ID      8ef5e8c067 M_ID_20599284b1
# ... with 185,119 more rows

2

变量合并 - unite(tidyr包)

#将"noneed", "noneed2", "id"三个列重新合并为ID单列,连接符为":"

category_sep %>% 
  unite(noneed, noneed2, id, col = "ID", sep = ":")
  
  # A tibble: 185,129 x 2
   ID              merchant_id    
   <chr>           <chr>          
 1 C:ID:cb34e13a1e M_ID_f162748793
 2 C:ID:8a1184a21e M_ID_00c57eaac8
 3 C:ID:69ed7c9177 M_ID_1da88fbd75
 4 C:ID:ae6f8dd403 M_ID_ebbdb42da6
 5 C:ID:07b2107875 M_ID_7291e8adc1
 6 C:ID:34521fce1b M_ID_0649e6e556
 7 C:ID:5d09feb8b0 M_ID_b794b9d9e8
 8 C:ID:a12f57ce14 M_ID_73d19e5b76
 9 C:ID:88fb23a843 M_ID_a79f97e414
10 C:ID:8ef5e8c067 M_ID_20599284b1
# ... with 185,119 more rows

最后

    相信写到这里大家已经基本理解了stringr包的功能与用法,数据清洗作为特征工程中重要的一环,熟练掌握数据表内的字符串的增删查改功能才能为数据分析提供最基本的支撑。

·END·

R语言与数据分析

生产力干货

f74b46d751fa10f929d918e98b8f0cca.jpeg

微信号:RforData
  • 3
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 4
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 4
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值