评论数据获取、词频统计、词云图

最新推荐文章于 2024-06-18 10:58:00 发布

qianjinwang

最新推荐文章于 2024-06-18 10:58:00 发布

阅读量1.7k

点赞数 1

文章标签：数据挖掘

本文链接：https://blog.csdn.net/qianjin_w/article/details/105109842

版权

# coding: utf-8

# In[2]:

import urllib.request
import re
import requests
import time
import random
import json

# Request header: a desktop-browser User-Agent string.
headers = ('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36')
# Install the header on urllib's global opener (affects urllib.request calls only).
opener = urllib.request.build_opener()
opener.addheaders = [headers]
urllib.request.install_opener(opener)

# Comment endpoint for one product; the page number and the JSONP callback
# name are supplied per request through `params`.
url = 'https://sclub.jd.com/comment/productPageComments.action?callback=&productId=100002749549&score=0&sortType=5&pageSize=10&isShadowSku=0&fold=1'

# Fetch 20 pages of comments and write model / nickname / content of every
# comment to one UTF-8 text file. The `with` block closes the file even if
# a request or the JSON parsing fails half-way.
with open('E:/comments/华为P30.txt', 'w', encoding='utf-8') as f:
    for i in range(0, 20):
        # The fractional part of the millisecond timestamp varies the
        # callback name from request to request.
        t = str(time.time()*1000).split('.')
        pagram = {
            'page': i+1,
            'callback': 'fetchJSON_comment98vv4092%s' % (int(t[1])+1),
        }
        # Sleep a random fraction of a second between requests.
        time.sleep(random.random())

        # `requests` does not use the urllib opener installed above, so the
        # User-Agent has to be passed explicitly. The timeout keeps a stalled
        # connection from hanging the whole run.
        response = requests.get(url, params=pagram, headers=dict([headers]), timeout=10)

        # The body is JSON wrapped in a callback call; keep only the {...} part.
        matches = re.findall(r'{.*}', response.text)
        if not matches:
            # Empty or non-JSON answer: skip this page instead of crashing
            # with an IndexError.
            print('page %d: no JSON payload in response, skipped' % (i+1))
            continue

        data = json.loads(matches[0])
        for item in data.get('comments') or []:
            f.write('手机型号：'+item['referenceName']+'\n'+'昵称：'+item['nickname']+'\n'+'评论内容：'+item['content']+'\n')

# In[12]:

import jieba

def quchong(infile, outfile):
    """Copy ``infile`` to ``outfile``, dropping repeated lines.

    Only the first occurrence of each line is written, in the original
    order. Lines are compared verbatim, including their line ending.
    Both files are read/written as UTF-8.

    Args:
        infile: Path of the text file to read.
        outfile: Path of the text file to create or overwrite.
    """
    # A set gives O(1) membership tests; the `with` block closes both files
    # even when reading or writing raises.
    seen = set()
    with open(infile, 'r', encoding='utf-8') as src, \
            open(outfile, 'w', encoding='utf-8') as dst:
        for line in src:
            if line not in seen:
                seen.add(line)
                dst.write(line)

# De-duplicate the scraped comment file; the segmentation step below reads
# the de-duplicated copy.
quchong("E:/comments/华为P30.txt", "E:/comments/P30去重.txt")

# jieba.load_userdict('userdict.txt')
def stopwordslist(filepath):
    """Load a stop-word list from a UTF-8 text file.

    Args:
        filepath: Path of a file holding one stop word per line.

    Returns:
        A list with one entry per line of the file, each stripped of
        surrounding whitespace (blank lines yield empty strings).
    """
    # The `with` block closes the file; the original left it open.
    with open(filepath, 'r', encoding='utf-8') as fh:
        return [line.strip() for line in fh]

# Stop words are loaded from disk on the first call only and kept as a set,
# so segmenting many lines does not re-read the file or scan a list per word.
_stopwords_cache = None


def seg_sentence(sentence):
    """Segment one sentence with jieba and drop stop words.

    Args:
        sentence: The text to segment; surrounding whitespace is ignored.

    Returns:
        A string in which every kept word is followed by a single space
        (so a non-empty result ends with a space). Tab tokens and words
        found in the stop-word file are left out.
    """
    global _stopwords_cache
    if _stopwords_cache is None:
        # Path of the stop-word file.
        _stopwords_cache = set(stopwordslist('E:/comments/cn_stopwords.txt'))

    kept = [
        word
        for word in jieba.cut(sentence.strip())
        if word != '\t' and word not in _stopwords_cache
    ]
    return ''.join(word + ' ' for word in kept)

# Segment every de-duplicated line and write one segmented line per input
# line. The output is written as UTF-8 explicitly: the word-frequency step
# reads this file back as UTF-8, and the platform default encoding (e.g. a
# locale code page on Windows) would not round-trip.
with open('E:/comments/P30去重.txt', 'r', encoding='utf-8') as inputs, \
        open('E:/comments/P30分词.txt', 'w', encoding='utf-8') as outputs:
    for line in inputs:
        line_seg = seg_sentence(line)  # seg_sentence returns a string
        outputs.write(line_seg + '\n')
print('分词完毕')

# In[14]:

# 词频统计
import jieba.analyse
from collections import Counter # 词频统计

# Count how often each token occurs in the segmented comment file.
with open('E:/comments/P30分词.txt', 'r', encoding='utf-8') as fr:
    # The segmented file separates words with spaces and newlines, and
    # jieba.cut hands those separators back as tokens of their own. Drop
    # whitespace-only tokens so they are not counted as words (a newline
    # "word" would also break the one-entry-per-line output below).
    data = dict(Counter(tok for tok in jieba.cut(fr.read()) if tok.strip()))

# Write one "<word>, <count>" line per distinct token.
with open('E:/comments/P30词频.txt', 'w', encoding='utf-8') as fw:
    for k, v in data.items():
        fw.write('%s, %d\n' % (k, v))

# In[18]:

import jieba.analyse
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Build the word cloud from the word-frequency file.
#
# Each line of that file is "<word>, <count>". The counts are parsed and
# passed to the word cloud directly. Running keyword extraction over the
# file's text instead would see every word exactly once, so the counts
# would have no influence on which words are picked or how large they are
# drawn, and the count digits themselves could be picked up as keywords.
frequencies = {}
with open('E:/comments/P30词频.txt', encoding='utf-8') as f:
    for line in f:
        # Split on the last ", " so a word containing a comma stays intact.
        word, sep, count = line.rstrip('\n').rpartition(', ')
        word = word.strip()
        count = count.strip()
        # Skip malformed lines and single-character tokens; this keeps the
        # cloud limited to multi-character words.
        if not sep or len(word) < 2 or not count.isdecimal():
            continue
        frequencies[word] = frequencies.get(word, 0) + int(count)

# Keep the 50 most frequent words.
keyword = sorted(frequencies, key=frequencies.get, reverse=True)[:50]

# Configure the word cloud.
wc = WordCloud(
    # Background colour of the image.
    background_color="white",
    # Upper bound on the number of words drawn.
    max_words=2000,
    # Font file used to render the words (taken from the Windows fonts
    # folder); a font with CJK glyphs is needed for Chinese text.
    font_path='C:/Windows/Fonts/simfang.ttf',
    height=1200,
    width=1600,
    # Largest font size used for a word.
    max_font_size=100,
    # Seed for the random placement/colouring, so reruns look the same.
    random_state=30,
)

# Word size follows the comment word counts.
myword = wc.generate_from_frequencies({word: frequencies[word] for word in keyword})

# Save the image first so it is written even if no display is available,
# then show it.
wc.to_file('E:/comments/P30.png')
plt.imshow(myword)
plt.axis("off")
plt.show()