用R进行电商空气净化器爬虫数据的清洗、拆分、分析

净水器爬虫数据分析请参考文章:https://blog.csdn.net/sl11u19/article/details/137905397?spm=1001.2014.3001.5502

本篇介绍空气净化器爬虫数据的分析,思路与上述净水器一文相似。以下是爬虫结果的预览。

第一步:多个月的爬虫数据合并、清洗

# Directory holding the crawler exports. The working directory is still
# changed here because later steps may rely on relative paths.
setwd("C:/Users/JD_DATA")
# list.files() expects a regular expression, not a shell glob, so match the
# ".csv" extension explicitly.
temp <- list.files(pattern = "\\.csv$")

# Merge the exports into one data frame.
# Every file that was found is read (the count is no longer assumed to be
# exactly 24), and the rows are bound once instead of growing `data` inside
# a loop.
data <- data.frame()
if (length(temp) > 0) {
  data <- do.call(rbind, lapply(temp, read.csv))
}

# Move source column 11 (the page URL) to position 6, then give every column
# a readable name.
column_order <- c(1:5, 11, 6:10, 12:13)
column_names <- c(
  "标题", "价格", "类别", "店铺名称", "商品介绍", "页面网址",
  "字段6", "字段7", "字段8", "字段9", "字段10", "当前时间", "文本"
)
data <- data[, column_order]
names(data) <- column_names

library(stringr)
# Drop exact duplicate rows before normalising the text.
data <- unique(data)
# Strip line breaks from the title, then remove every space from the title
# and from the product description.
data$标题 <- gsub("[\n\r]", "", data$标题)
for (text_col in c("标题", "商品介绍")) {
  data[[text_col]] <- gsub(" ", "", data[[text_col]], fixed = TRUE)
}
# Rows that differed only in whitespace are identical now: dedupe again.
data <- unique(data)

# Data cleaning: drop rows whose title or product description contains a
# blacklisted keyword.

# Returns TRUE for each element of `text` that contains at least one entry of
# `keywords`. Keywords are matched as literal substrings (fixed = TRUE), so
# the "(" and ")" in brand names are not interpreted as regex grouping.
contains_any_keyword <- function(text, keywords) {
  hits <- lapply(keywords, grepl, x = text, fixed = TRUE)
  Reduce(`|`, hits, rep(FALSE, length(text)))
}

# Keywords that disqualify a listing when they appear in the title.
title_delete<-c("空气开关","香氛机","焊锡","焊接",
                "养殖","食品厂","制药","无尘车间",
                "工业漆","工业废气","锅炉","工业水",
                "无尘间","烤漆房","喷漆房","熨斗",
                "锅","空调扇制冷器冷风机","泳池","水厂",
                "工厂车间","滤水壶","次氯酸","实验室",
                "二氧化氯")
data <- data[!contains_any_keyword(data$标题, title_delete), ]

# Brand names that disqualify a listing when they appear anywhere in the
# product description.
brand_delete<-c("骏虹骁","赞璐桐(ZANLUTONG)","境珧",
                "宓蝶","ZTK","三康王","MATE",
                "简庭","乾越","听海","索爱",
                "艾沃得(Aiwode)","东耐伦","米皮","杰利普(Jielipu)",
                "西可微(xiker)","KMR","旦榄","诺比克(nobico)",
                "勉茂","普拉扎","爽威","绿尘","TIGER","APIXINTL",
                "OUIO","皇灯堡","杰霆","姿秒迪","温意","图姆斯",
                "听为","糖蚁","思艾格","适丰","爱尚达","OLOEY",
                "皓庭","黑桃A","安益恒辉(A)","恒佳境","皇迎",
                "华幻","萨米格","途美丝","鸥莱茵(OLEYIN)","良雫",
                "哆米多密","皮谱","弭黎","solgar","卡斯七(CARSHCH)",
                "高格")
data <- data[!contains_any_keyword(data$商品介绍, brand_delete), ]
# No scratch "temp" column is created, so `data` keeps its 13 columns.

第二步:字段拆分、统一写法

# Product description: copy the text that follows each "label:" (up to the
# next line break) into a column named after the label.
# str_extract() yields a plain character vector (NA when the label is
# missing). str_extract_all() would store a list in every column, which the
# later strsplit() calls on these columns cannot consume.
# NOTE(review): only the first occurrence of each label is kept - confirm
# that a label never appears more than once per description.
intro_fields <- c("品牌", "商品毛重", "商品产地", "特色功能", "适用人群",
                  "类型", "功能", "能效等级", "适用面积")
for (field in intro_fields) {
  # "功能:" is also the tail of "特色功能:"; reject a match at that spot so
  # the two columns do not receive the same value.
  guard <- if (field == "功能") "(?<!特色功能:)" else ""
  pattern <- paste0(guard, "(?<=", field, ":).+(?=\\n)")
  data[[field]] <- str_extract(data$商品介绍, pattern)
}


# Remove every space from the five raw spec columns, then join them row by
# row into a single newline-separated text column named "字段".
spec_cols <- paste0("字段", 6:10)
for (spec_col in spec_cols) {
  data[[spec_col]] <- gsub(" ", "", data[[spec_col]], fixed = TRUE)
}
data$字段 <- do.call(paste, c(unname(as.list(data[spec_cols])), sep = "\n"))

# Spec text: copy the text that follows each label (up to the next line
# break) into a column named after the label.
# str_extract() is used so that each column is a character vector (NA when
# the label is missing) rather than the list that str_extract_all() returns.
spec_fields <- c("产品尺寸", "产品净重", "额定电压", "电源线长", "额定功率",
                 "电机类型", "操控方式", "净化方式", "HEPA滤网等级",
                 "最高档声功率级噪音", "最低档声功率级噪音", "颗粒物CCM",
                 "固态污染物CADR", "气态污染物CCM", "气态CADR",
                 "过滤污染物类型", "空气质量显示", "空气质量传感器")
for (field in spec_fields) {
  data[[field]] <- str_extract(data$字段, paste0("(?<=", field, ").+(?=\\n)"))
}
# The label contains "/", which is left out of the column name.
data$国产进口 <- str_extract(data$字段, "(?<=国产/进口).+(?=\\n)")
# This label is only accepted when three line breaks precede it.
data$上市时间 <- str_extract(data$字段, "(?<=\n\n\n上市时间).+(?=\\n)")

# Drop the columns that are no longer needed.
# Columns are chosen by name instead of by position, so the selection does
# not depend on the exact column count of `data` at this point.
colnames(data)
kept_cols <- c(
  "页面网址", "标题", "价格", "店铺名称", "当前时间", "文本",
  "品牌", "商品毛重", "商品产地", "特色功能", "适用人群",
  "类型", "功能", "能效等级", "适用面积",
  "产品尺寸", "产品净重", "额定电压", "电源线长", "额定功率",
  "电机类型", "操控方式", "净化方式", "HEPA滤网等级",
  "最高档声功率级噪音", "最低档声功率级噪音", "颗粒物CCM",
  "固态污染物CADR", "气态污染物CCM", "气态CADR",
  "过滤污染物类型", "空气质量显示", "空气质量传感器",
  "国产进口", "上市时间"
)
data1 <- data[, kept_cols]

# Unify brand spellings: each alias on the right is rewritten to the
# canonical name on the left; brands that are not listed stay unchanged.
brand_aliases <- list(
  "飞利浦(PHILIPS)" = c("PHILIPS", "飞利浦"),
  "小米(MI)" = c("米家", "米家(MIJIA)", "小米"),
  "松下(Panasonic)" = c("松下电器(panasonic)", "松下电器", "松下", "Panasonic"),
  "霍尼韦尔(Honeywell)" = c("霍尼韦尔", "HONEYWELL"),
  "艾美特(Airmate)" = c("艾美特"),
  "奥克斯(AUX)" = c("奥克斯"),
  "美的(Midea)" = c("美的", "Midea"),
  "海尔(Haier)" = c("海尔", "Haier"),
  "格力(GREE)" = c("格力"),
  "亚都(YADU)" = c("亚都YADU", "亚都"),
  "夏普(Sharp)" = c("夏普", "SHARP"),
  "西屋(Westinghouse)" = c("西屋", "Westinghouse"),
  "西门子(SIEMENS)" = c("西门子"),
  "惠而浦(whirlpool)" = c("惠而浦", "惠而浦(whirpool)"),
  "东芝(TOSHIBA)" = c("東芝", "東芝(TOSHIBA)", "东芝"),
  "戴森(DYSON)" = c("戴森", "DYSON", "dyson"),
  "三菱(MITSUBISHI)" = c("三菱", "三菱电机(MITSUBISHIELECTRIC)"),
  "三星(SAMSUNG)" = c("三星"),
  "摩飞电器(Morphyrichards)" = c("摩飞", "摩飞电器(MFHZPOK)"),
  "华为(HUAWEI)" = c("华为智选", "华为"),
  "苏泊尔(SUPOR)" = c("苏泊尔"),
  "大宇(DAEWOO)" = c("大宇"),
  "科沃斯(ECOVACS)" = c("科沃斯机器人(ECOVACS)", "科沃斯"),
  "飞立(FEILI)" = c("飞立"),
  "荣耀(HONOR)" = c("荣耀亲选", "荣耀")
)
# Flatten the table into a lookup vector: names are aliases, values are the
# canonical spellings.
alias_to_canonical <- setNames(
  rep(names(brand_aliases), lengths(brand_aliases)),
  unlist(brand_aliases, use.names = FALSE)
)
brand <- as.character(data1$品牌)
is_alias <- brand %in% names(alias_to_canonical)
brand[is_alias] <- unname(alias_to_canonical[brand[is_alias]])

# Upper-case the brand names.
data1$品牌 <- toupper(brand)

第三步:对文本类字段进行词频分析

data <- data1

# Split every value of `column` on `sep` and count how often each resulting
# term occurs, most frequent first. Missing values are skipped so that they
# are not counted as a term. Returns the table produced by jiebaR::freq().
count_terms <- function(column, sep) {
  values <- unlist(column, use.names = FALSE)
  values <- values[!is.na(values)]
  terms <- unlist(strsplit(values, sep), use.names = FALSE)
  counts <- jiebaR::freq(terms)
  dplyr::arrange(counts, -freq)
}

# Remove the space after each comma in "功能" so that splitting on "," does
# not leave terms with a leading space.
data$功能 <- gsub(", ", ",", data$功能)

# Separator used inside each multi-valued column.
term_separators <- c(
  "特色功能" = ",",
  "适用人群" = ",",
  "功能" = ",",
  "操控方式" = ";",
  "空气质量显示" = ";",
  "空气质量传感器" = ";"
)

# One frequency table per column, kept in a named list so that no table is
# overwritten by the next one.
wordfreqs_by_field <- lapply(names(term_separators), function(field) {
  count_terms(data[[field]], term_separators[[field]])
})
names(wordfreqs_by_field) <- names(term_separators)

# `wordfreqs` still ends up holding the table of the last column analysed.
wordfreqs <- wordfreqs_by_field[["空气质量传感器"]]

未完待续(TBC)

  • 9
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值