This is my entry for a data mining competition; only the code part is posted here, not the paper itself.
The work is fairly average and there is plenty of room for improvement.
Still, it took real effort to put together and parts of it may be worth borrowing, so I am posting it for everyone to discuss.
# Read in the data
guomei = read.csv("C:\\Users\\hormy\\Desktop\\电热评论原始数据\\汇总-国美.csv",stringsAsFactors=F,header=T)
jingdong = read.csv("C:\\Users\\hormy\\Desktop\\电热评论原始数据\\汇总-京东.csv",stringsAsFactors=F,header=T)
suning = read.csv("C:\\Users\\hormy\\Desktop\\电热评论原始数据\\汇总-苏宁.csv",stringsAsFactors=F,header=T)
tianmao = read.csv("C:\\Users\\hormy\\Desktop\\电热评论原始数据\\汇总-天猫淘宝.csv",stringsAsFactors=F,header=T)
yixun = read.csv("C:\\Users\\hormy\\Desktop\\电热评论原始数据\\汇总-易迅.csv",stringsAsFactors=F,header=T)
# Drop the unneeded columns and unify the column names
guomei = guomei[,4:5]
names(guomei)= c('品牌','评论')
jingdong = jingdong[,5:6]
names(jingdong)= c('品牌','评论')
suning = suning[,c(5,7)]
names(suning) = c('品牌','评论')
tianmao = tianmao[,5:6]
names(tianmao) = c('品牌','评论')
yixun = yixun[,c(5,7)]
names(yixun) = c('品牌','评论')
# Remove default positive reviews and blank reviews
guomei = guomei[guomei$评论!="未及时做出评论,默认好评!",]
tianmao = tianmao[complete.cases(tianmao$评论),]
# Select the Haier reviews
haier = c(guomei$评论,jingdong[jingdong$品牌=="海尔",]$评论,suning[suning$品牌=="海尔",]$评论,
tianmao[tianmao$品牌=="海尔",]$评论,yixun[yixun$品牌=="海尔",]$评论)
# Reviews of all brands other than Haier
others = c(jingdong[jingdong$品牌!="海尔",]$评论,suning[suning$品牌!="海尔",]$评论,
tianmao[tianmao$品牌!="海尔",]$评论,yixun[yixun$品牌!="海尔",]$评论)
# Remove digits and letters
haier = gsub("[a-z0-9A-Z_]","",haier)
others = gsub("[a-z0-9A-Z_]","",others)
# Rwordseg: a word segmentation package based on ICTCLAS from the Chinese Academy of Sciences
library(Rwordseg)
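# The next comment mentions a custom dictionary, but that step is not shown in the
# pasted code. A minimal sketch of how it could be done with Rwordseg (the words and
# the .scel path below are hypothetical examples, not the dictionary actually used):
# insertWords(c("热水器","防电墙"))                  # add domain terms so they are not split apart
# installDict("C:\\dict\\reshuiqi.scel","reshuiqi")  # or install a Sogou cell dictionary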
# After adding the custom dictionary, segment the Haier reviews
haier = segmentCN(haier,nature=TRUE)
# Segment the other brands' reviews
others = segmentCN(others,nature=TRUE)
rm(guomei,tianmao,jingdong,suning,yixun)
# Remove stop words
# Build a custom stop word list stopwordsCN.txt and read it in; the file must be UTF-8 encoded
stopwordsCN = as.character(readLines("stopwordsCN.txt"))
stopwordsCN = enc2utf8(stopwordsCN)
stopwordsCN = stopwordsCN[Encoding(stopwordsCN)!="unknown"]
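# A sketch of an alternative, assuming stopwordsCN.txt really is saved as UTF-8:
# declaring the encoding in readLines avoids the conversion above, though it would
# also keep any pure-ASCII lines that the Encoding filter drops.
# stopwordsCN = readLines("stopwordsCN.txt", encoding = "UTF-8")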
# Custom stop word removal function
removeStopWords <- function(x, stopwords) {
  temp <- character(0)
  index <- 1
  xLen <- length(x)
  while (index <= xLen) {
    if (length(stopwords[stopwords == x[index]]) < 1)
      temp <- c(temp, x[index])
    index <- index + 1
  }
  temp
}
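# A minimal vectorized alternative to the loop above (commented-out sketch; it should
# behave the same, since %in% keeps the original order and duplicates):
# removeStopWords <- function(x, stopwords) x[!(x %in% stopwords)]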
# Apply the stop word removal
haier = lapply(haier,removeStopWords,stopwordsCN)
others = lapply(others,removeStopWords,stopwordsCN)
# Extract nouns and verbal nouns from the Haier reviews as the candidate product feature set
haier.vc = unlist(haier)
haier.character = c(haier.vc[grep("n",names(haier.vc))],haier.vc[grep("vn",names(haier.vc))])
haier.character = haier.character[nchar(haier.character)>1]
# Extract nouns and verbal nouns from the other brands' reviews as their candidate product feature set
others.vc = unlist(others)
others.character = c(others.vc[grep("n",names(others.vc))],others.vc[grep("vn",names(others.vc))])
others.character = others.character[nchar(others.character)>1]
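# Note: grep("n", ...) already matches every tag containing "n" (including "vn" and
# "an"), so the extra "vn" grep above adds those words a second time. A sketch of a
# stricter, non-duplicating version, assuming the usual ICTCLAS tag names
# (pick_nouns is a hypothetical helper, not part of the original script):
# pick_nouns <- function(vc) vc[grepl("^(n|vn)", names(vc))]
# haier.character  = pick_nouns(haier.vc)
# others.character = pick_nouns(others.vc)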
# Feature extraction: build the initial feature set
# Keep candidate features that appear at least 100 times
haier.character = sort(table(haier.character))
haier.character = haier.character[as.vector(haier.character) >= 100]
others.character = sort(table(others.character))
others.character = others.character[as.vector(others.character) >= 100]
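# A quick way to eyeball the most frequent candidates before the manual screening
# below (tail() works here because the tables were sorted in ascending order):
tail(haier.character, 20)
tail(others.character, 20)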
# Compute frequencies from the candidate product feature sets and visualize them with word clouds
library(wordcloud)
windows() # open a new plot device to show the word cloud
wordcloud(names(haier.character),as.vector(haier.character),colors=brewer.pal(8,"Dark2"),min.freq=500) # plot words appearing at least 500 times
windows()
wordcloud(names(others.character),as.vector(others.character),colors=brewer.pal(8,"Dark2"),
min.freq=500*3535/2730)
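# The min.freq above presumably rescales the 500-count threshold by the relative sizes
# of the two review sets (an assumption: the counts 3535 and 2730 are not defined in
# this script). wordcloud() also places words randomly, so setting a seed first,
# e.g. set.seed(1), makes the layout reproducible across runs.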
# Manually remove non-feature nouns, arriving at a set of 9 product features
waiguan = c('造型','外表','外包装','样式','外形','款式','样子','设计','外观','外壳','包装')
peijian = c('电源线','接口','旋钮','管线','阀门','水龙头','面板','管道','架子','遥控器','接头','螺丝','软管','