R语言中str_extract_all函数

str_extract_all 是 stringr 包中的一个函数,在做数据清洗时很有用:它按照给定的正则表达式规则,从字符串中提取出所有匹配的内容,并以列表形式返回(每个输入字符串对应列表中的一个元素)。具体用法如下:

# Sample input: lowercase letters with the digits "12" embedded near the end.
x<-"abacdef12g"

# [f0-9] matches a single character that is either "f" or a digit, so every
# such character is returned as its own match.
 str_extract_all(x,"[f0-9]")
[[1]]
[1] "f" "1" "2"

# {1,3} lets one match span up to three consecutive characters from the class;
# the quantifier is greedy, so "f12" comes back as a single match.
> str_extract_all(x,"[f0-9]{1,3}")
[[1]]
[1] "f12"

# {1,2} caps each match at two characters, so the run "f12" is split into
# "f1" followed by "2".
> str_extract_all(x,"[f0-9]{1,2}")
[[1]]
[1] "f1" "2" 

附上一些平时写的代码

library(ggplot2)
library(RMySQL)
library(stringr)
library(sqldf)
library(plyr)
# Load every visit to a product page (page_url starting with
# https://item.zhong.com) from tracker.hbase_visit into `rawdata_vi`.
# NOTE(review): the database credentials are hard-coded in this script;
# consider reading them from environment variables or a config file.
conn <- dbConnect(MySQL(), dbname = "tracker", username = "gaoyang922",
                  password = "gaoyang922@123456!", host = "10.10.109.62",
                  port = 1333)
# SET NAMES yields no rows; release its result set right away so it does not
# stay open on the connection.
invisible(dbClearResult(dbSendQuery(conn, "SET NAMES utf8")))
query <- dbSendQuery(conn, "SELECT key_table,left(insert_time,8) as 
insert_date,label,sessionid,stay_time,site,page_url FROM tracker.hbase_visit
where insert_time is not null and page_url like 'https://item.zhong.com%' ")
# n = -1 fetches all pending rows.
rawdata_vi <- fetch(query, n = -1)
# Free the result set before closing the connection.
invisible(dbClearResult(query))

dbDisconnect(conn)

# Quick sanity checks on the downloaded data.
dim(rawdata_vi)
head(rawdata_vi)
nrow(rawdata_vi)
# rawdata_vi$prodID=as.numeric(unlist(str_extract_all(rawdata_vi$page_url,"[0-9]{1,2}")))
# Extract a product ID from a single page URL.
#
# Args:
#   x: a URL string.
#
# Returns:
#   If `x` contains "productId": the first run of one or two digits found in
#   `x`, converted to numeric (NA when `x` contains no digit at all).
#   Otherwise the sentinel value 9999, which callers use to filter out
#   non-product pages.
#
# NOTE(review): the pattern takes the first digits anywhere in the URL and at
# most two of them -- confirm that product IDs never exceed two digits and
# that no other number precedes the ID in the URL.
f <- function(x) {
  if (grepl("productId", x)) {
    # str_extract_all() returns a list; flatten it and keep the first match.
    result <- as.numeric(unlist(str_extract_all(x, "[0-9]{1,2}"))[1])
  } else {
    result <- 9999
  }
  result
}
# Derive the product ID for each visited URL. vapply() guarantees a numeric
# vector even when rawdata_vi has zero rows (sapply() would return a list).
rawdata_vi$prodID <- vapply(rawdata_vi$page_url, f, numeric(1))
# Keep only rows that belong to a product page (9999 marks "no product ID").
rawdata_vi_prod <- subset(rawdata_vi, prodID != 9999)

head(rawdata_vi_prod)
dim(rawdata_vi_prod)

##############  The code above derived the product ID of every URL
#### The code below loads the attributes of every product
# NOTE(review): the database credentials are hard-coded in this script;
# consider reading them from environment variables or a config file.
conn <- dbConnect(MySQL(), dbname = "pms", username = "gaoyang922",
                  password = "gaoyang922@123456!", host = "10.10.109.62",
                  port = 1333)
# SET NAMES yields no rows; release its result set right away.
invisible(dbClearResult(dbSendQuery(conn, "SET NAMES utf8")))
# Every product column plus the name of its business category.
query <- dbSendQuery(conn, "select a.*,b.BizCategoryName from
 pms.pms_product a left join pms.pms_biz_category b
on a.ProductType = b.ID ")
rawdata_pd <- fetch(query, n = -1)
# Free the result set before closing the connection.
invisible(dbClearResult(query))
dbDisconnect(conn)

head(rawdata_pd)
#######  

##### Daily summary
# Keep only the visit date and the product ID (columns 2 and 8 of
# rawdata_vi_prod); selecting by name does not depend on column order.
prod_sumy1 <- rawdata_vi_prod[, c("insert_date", "prodID")]
# Page views per (date, product) and per product overall; plyr::count()
# stores the tally in a column named `freq`.
prod_sumy2 <- count(prod_sumy1, c("insert_date", "prodID"))
prod_sumy3 <- count(prod_sumy1, "prodID")

head(prod_sumy2)

# Attach product attributes; all.x = TRUE keeps products that have no match
# in rawdata_pd.
# NOTE(review): columns 1, 4, 5 and 31 of rawdata_pd are picked by position
# from a `select a.*` -- confirm they are still the intended attributes.
prod1 <- merge(prod_sumy2, rawdata_pd[, c(1, 4, 5, 31)],
               by.x = "prodID", by.y = "ID", all.x = TRUE)
head(prod1)

# Most recent date first, then most viewed product first.
rank_order <- order(-as.numeric(prod1$insert_date), -prod1$freq)
daily_prod_rank <- prod1[rank_order, ]
names(daily_prod_rank)[1:3] <- c("ProdId", "InsertDate", "PVCnt")
head(daily_prod_rank)

daily_prod_rank$InsertDate <- as.character(daily_prod_rank$InsertDate)


### Export to the database
# NOTE(review): the database credentials are hard-coded in this script;
# consider reading them from environment variables or a config file.
conn <- dbConnect(MySQL(), dbname = "analyse", username = "gaoyang922",
                  password = "gaoyang922@123456!", host = "10.10.109.62",
                  port = 1333)
# NOTE(review): this writes the raw per-day counts (prod_sumy2), not the
# ranked and renamed daily_prod_rank -- confirm that is intended.
dbWriteTable(conn, "daily_prod_rank_raw", prod_sumy2)
# Close the connection so it is not leaked when `conn` is reassigned later.
dbDisconnect(conn)



### Product relation types: 1 = follow, 2 = purchase, 3 = reservation, 4 = like
# NOTE(review): the database credentials are hard-coded in this script;
# consider reading them from environment variables or a config file.
# NOTE(review): unlike the other read queries in this script, no
# 'SET NAMES utf8' is issued here although the query contains Chinese
# literals -- confirm the connection character set is correct.
conn <- dbConnect(MySQL(), dbname = "pms", username = "gaoyang922",
                  password = "gaoyang922@123456!", host = "10.10.109.62",
                  port = 1333)
# Number of user relations per product and relation type within the date range.
query <- dbSendQuery(conn, "SELECT productID,RelationType,
(case when RelationType=1 then '关注'
when  RelationType=2 then '购买'
                   when RelationType=4 then '点赞'
                   else '预约' end
)as RT_desc
                   ,count(*) as pd_cnt FROM pms.pms_user_relation
                  where CreateTime between '2015-08-18' and '2015-09-05'
                   group by ProductId,RelationType ")
rawdata_RT <- fetch(query, n = -1)
# Free the result set before closing the connection.
invisible(dbClearResult(query))
dbDisconnect(conn)

head(rawdata_RT)

# Frequency table of the extracted product IDs. prodID is a column of
# rawdata_vi, not a standalone variable, so it must be referenced through
# the data frame.
table(rawdata_vi$prodID)
# Turn prodID into a factor whose levels are ordered by how often each ID
# occurs, so the bars are drawn in order of frequency.
prodID_ggplot <- rawdata_vi$prodID
prodID_ggplot <- reorder(prodID_ggplot, prodID_ggplot, length)
rawdata_vi$prodID_ggplot <- prodID_ggplot
# Bar chart of page views per product, excluding the 9999 "no product" rows.
ggplot(subset(rawdata_vi, prodID != 9999), aes(x = prodID_ggplot)) +
  geom_bar()


# 
# library(ggplot2)
# library(RMySQL)
# library(stringr)
# library(dplyr)
# conn <- dbConnect(MySQL(), dbname = "tracker", username="zhoumeixu204", password="zhoumeixu204@123456!",host="10.10.109.62",port=1333)
# query<-dbSendQuery(conn, "SELECT key_table,left(insert_time,8) as 
#                    insert_date,label,sessionid,stay_time,site,page_url FROM tracker.hbase_visit
#                    where insert_time is not null  ")
# 
# 
# query_1<-dbSendQuery(conn,"
#                      select * from  tracker.hbase_visitor   where  insert_time  
#                      is not   NULL  and  city is not NUll   and country='china'")
# rawdata_vi <- fetch(query,n=-1)
# hbase_visitor<-fetch(query_1,n=-1)
# dbDisconnect(conn)
# dim(rawdata_vi)
# head(rawdata_vi)
# f<-function(x){
#   if(grepl("productId",x)){
#     result<-as.numeric(unlist(str_extract_all(x,"[0-9]{1,2}"))[1])
#     
#   }
#   else{
#     result<-9999
#   }
#   result
# }
# rawdata_vi$prodID =sapply(rawdata_vi$page_url,f)
# # rawdata_vi<-subset(rawdata_vi,prodID!=9999)
# table(prodID)
# 
# 
# prodID_ggplot<-rawdata_vi$prodID;prodID_ggplot<-reorder(prodID_ggplot,prodID_ggplot,length)
# rawdata_vi$prodID_ggplot<-prodID_ggplot
# site_ggplot<-rawdata_vi$site;site_ggplot<-reorder(site_ggplot,site_ggplot,length)
# rawdata_vi$site_ggplot<-site_ggplot
# ggplot(subset(rawdata_vi,prodID!=9999),aes(x=prodID_ggplot))+geom_bar(aes(fill=prodID_ggplot))
# ggplot(subset(rawdata_vi,prodID!=9999),aes(x=prodID_ggplot,fill=factor(insert_date)))+geom_bar(position = 'stack')+labs(title="移动终端占比柱形图")
# ggplot(rawdata_vi,aes(x=site_ggplot,fill=factor(insert_date)))+geom_bar(position = 'dodge')+labs(title="移动终端占比柱形图")
# ggplot(rawdata_vi,aes(x=site_ggplot,fill=factor(insert_date)))+geom_bar(position = 'stack')+labs(title="移动终端占比柱形图")
# ggplot(rawdata_vi,aes(x=site_ggplot,fill=factor(insert_date)))+geom_bar(position = 'dodge')+labs(title="移动终端占比柱形图")+facet_wrap(~insert_date,ncol=1)
# str(rawdata_vi)
# rawdata_vi_to_mysql<-data.frame(rawdata_vi$key_table,rawdata_vi$insert_date,rawdata_vi$label,rawdata_vi$sessionid,rawdata_vi$stay_time,rawdata_vi$site,rawdata_vi$page_url,rawdata_vi$prodID)
# conn <- dbConnect(MySQL(), dbname = "analyse_dev", username="root", password="Pa123456!",host="202.69.27.239",port=8443)
# rawdata_vi_to_mysql<-subset(rawdata_vi_to_mysql,rawdata_vi.prodID !=9999)
# dbWriteTable(conn, "rawdata_vi_to_mysql", rawdata_vi_to_mysql)
# dbDisconnect(conn)
# 
# 


  • 0
    点赞
  • 8
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值