This is my entry for a data mining competition; only the code part is posted here, not the paper itself.
The work is fairly average and there is plenty of room for improvement.
Still, it took real effort to put together and parts of it may be worth borrowing, so I am posting it for everyone to discuss.
# Read in the data
guomei = read.csv("C:\\Users\\hormy\\Desktop\\电热评论原始数据\\汇总-国美.csv",stringsAsFactors=F,header=T)
jingdong = read.csv("C:\\Users\\hormy\\Desktop\\电热评论原始数据\\汇总-京东.csv",stringsAsFactors=F,header=T)
suning = read.csv("C:\\Users\\hormy\\Desktop\\电热评论原始数据\\汇总-苏宁.csv",stringsAsFactors=F,header=T)
tianmao = read.csv("C:\\Users\\hormy\\Desktop\\电热评论原始数据\\汇总-天猫淘宝.csv",stringsAsFactors=F,header=T)
yixun = read.csv("C:\\Users\\hormy\\Desktop\\电热评论原始数据\\汇总-易迅.csv",stringsAsFactors=F,header=T)
# Drop the unneeded columns and unify the column names
guomei = guomei[,4:5]
names(guomei)= c('品牌','评论')
jingdong = jingdong[,5:6]
names(jingdong)= c('品牌','评论')
suning = suning[,c(5,7)]
names(suning) = c('品牌','评论')
tianmao = tianmao[,5:6]
names(tianmao) = c('品牌','评论')
yixun = yixun[,c(5,7)]
names(yixun) = c('品牌','评论')
# Remove default positive reviews and blank reviews
guomei = guomei[guomei$评论!="未及时做出评论,默认好评!",]
tianmao = tianmao[complete.cases(tianmao$评论),]
# Select the Haier reviews
haier = c(guomei$评论,jingdong[jingdong$品牌=="海尔",]$评论,suning[suning$品牌=="海尔",]$评论,
tianmao[tianmao$品牌=="海尔",]$评论,yixun[yixun$品牌=="海尔",]$评论)
# Reviews of all brands other than Haier
others = c(jingdong[jingdong$品牌!="海尔",]$评论,suning[suning$品牌!="海尔",]$评论,
tianmao[tianmao$品牌!="海尔",]$评论,yixun[yixun$品牌!="海尔",]$评论)
# Remove digits and letters
haier = gsub("[a-z0-9A-Z_]","",haier)
others = gsub("[a-z0-9A-Z_]","",others)
# Rwordseg: a word segmentation package based on ICTCLAS from the Chinese Academy of Sciences
library(Rwordseg)
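# The next comment mentions a custom dictionary, but that step is not shown in the
# pasted code. A minimal sketch of how it could be done with Rwordseg (the words and
# the .scel path below are hypothetical examples, not the dictionary actually used):
# insertWords(c("热水器","防电墙"))                  # add domain terms so they are not split apart
# installDict("C:\\dict\\reshuiqi.scel","reshuiqi")  # or install a Sogou cell dictionary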
# After adding the custom dictionary, segment the Haier reviews
haier = segmentCN(haier,nature=TRUE)
# Segment the other brands' reviews
others = segmentCN(others,nature=TRUE)
rm(guomei,tianmao,jingdong,suning,yixun)
# Remove stop words
# Build a custom stop word list stopwordsCN.txt and read it in; the file must be UTF-8 encoded
stopwordsCN = as.character(readLines("stopwordsCN.txt"))
stopwordsCN = enc2utf8(stopwordsCN)
stopwordsCN = stopwordsCN[Encoding(stopwordsCN)!="unknown"]
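# A sketch of an alternative, assuming stopwordsCN.txt really is saved as UTF-8:
# declaring the encoding in readLines avoids the conversion above, though it would
# also keep any pure-ASCII lines that the Encoding filter drops.
# stopwordsCN = readLines("stopwordsCN.txt", encoding = "UTF-8")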
# Custom stop word removal function
removeStopWords <- function(x, stopwords) {
  temp <- character(0)
  index <- 1
  xLen <- length(x)
  while (index <= xLen) {
    if (length(stopwords[stopwords == x[index]]) < 1)
      temp <- c(temp, x[index])
    index <- index + 1
  }
  temp
}
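# A minimal vectorized alternative to the loop above (commented-out sketch; it should
# behave the same, since %in% keeps the original order and duplicates):
# removeStopWords <- function(x, stopwords) x[!(x %in% stopwords)]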
# Apply the stop word removal
haier = lapply(haier,removeStopWords,stopwordsCN)
others = lapply(others,removeStopWords,stopwordsCN)
# Extract nouns and verbal nouns from the Haier reviews as the candidate product feature set
haier.vc = unlist(haier)
haier.character = c(haier.vc[grep("n",names(haier.vc))],haier.vc[grep("vn",names(haier.vc))])
haier.character = haier.character[nchar(haier.character)>1]
# Extract nouns and verbal nouns from the other brands' reviews as their candidate product feature set
others.vc = unlist(others)
others.character = c(others.vc[grep("n",names(others.vc))],others.vc[grep("vn",names(others.vc))])
others.character = others.character[nchar(others.character)>1]
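# Note: grep("n", ...) already matches every tag containing "n" (including "vn" and
# "an"), so the extra "vn" grep above adds those words a second time. A sketch of a
# stricter, non-duplicating version, assuming the usual ICTCLAS tag names
# (pick_nouns is a hypothetical helper, not part of the original script):
# pick_nouns <- function(vc) vc[grepl("^(n|vn)", names(vc))]
# haier.character  = pick_nouns(haier.vc)
# others.character = pick_nouns(others.vc)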
# Feature extraction: build the initial feature set
# Keep candidate features that appear at least 100 times
haier.character = sort(table(haier.character))
haier.character = haier.character[as.vector(haier.character) >= 100]
others.character = sort(table(others.character))
others.character = others.character[as.vector(others.character) >= 100]
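# A quick way to eyeball the most frequent candidates before the manual screening
# below (tail() works here because the tables were sorted in ascending order):
tail(haier.character, 20)
tail(others.character, 20)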
# Compute frequencies from the candidate product feature sets and visualize them with word clouds
library(wordcloud)
windows() # open a new plot device to show the word cloud
wordcloud(names(haier.character),as.vector(haier.character),colors=brewer.pal(8,"Dark2"),min.freq=500) # plot words appearing at least 500 times
windows()
wordcloud(names(others.character),as.vector(others.character),colors=brewer.pal(8,"Dark2"),
min.freq=500*3535/2730)
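# The min.freq above presumably rescales the 500-count threshold by the relative sizes
# of the two review sets (an assumption: the counts 3535 and 2730 are not defined in
# this script). wordcloud() also places words randomly, so setting a seed first,
# e.g. set.seed(1), makes the layout reproducible across runs.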
# Manually remove non-feature nouns, arriving at a set of 9 product features
waiguan = c('造型','外表','外包装','样式','外形','款式','样子','设计','外观','外壳','包装')
peijian = c('电源线','接口','旋钮','管线','阀门','水龙头','面板','管道','架子','遥控器','接头','螺丝','软管','