实验环境
版本:Python 3.7
依赖:requests、jieba、wordcloud、matplotlib(json、time、os 为 Python 标准库,无需额外安装)
安装过程这里不再赘述
流程说明
1、 获取需要分析的url,进行后面的操作
2、 根据该url进行数据抓取,当前口碑是分页加载,需要多次请求才能获得完整数据
3、 将获得的口碑数据进行文件写入,为后续分析提供数据基础
4、 提供两种方式进行词云展示,第一种是generate,使用全部数据进行词云绘制,第二种是generate_from_frequencies。自行使用jieba分词进行词云的绘制
5、 结果展示,资料下载
6、 TODO
数据链接入口地址抓取
以轩逸为例,在懂车帝移动端打开页面https://m.dcdapp.com/motor/m/car_series/index?series_id=1145&zt=pc_redirect,进行分析,可查看到如下界面
将该请求地址记录下来,即为入口地址
代码案例
import requests
import json
import time
import os
import jieba.posseg as psg
from wordcloud import WordCloud
import matplotlib.pyplot as plt #绘制图像的模块
keywords = {}
def writeFile(line):
    """Append one line of text to target.txt, followed by a newline.

    Args:
        line: the text to append (without a trailing newline).
    """
    # Context manager guarantees the handle is closed even if the write
    # raises; explicit UTF-8 avoids platform-dependent default encodings
    # mangling the Chinese review text.
    with open("target.txt", "a+", encoding="utf-8") as fo:
        fo.write(line + "\n")
def getKeyWords(data):
    """Extract review text from a koubei item list and persist it to target.txt.

    For each item, collects the main review 'content' plus the text of every
    comment and nested reply, appends each line to target.txt via writeFile,
    and returns the collected lines.

    Args:
        data: list of item dicts from the koubei_list_v2 API; each item may
              carry an 'info' dict with 'content' and 'comment_list' keys.

    Returns:
        list[str]: every piece of text found, in traversal order.
    """
    lines = []
    for item in data:
        info = item.get('info')
        if not isinstance(info, dict):
            continue
        # Only record non-empty review bodies (the original could append '').
        content = info.get('content', '')
        if content:
            lines.append(content)
        comment_list = info.get('comment_list')
        # BUG FIX: the original read `comment_list` even when the key was
        # absent, which raised NameError on the first such item; the API also
        # sometimes returns a non-list placeholder here — skip both cases.
        if not isinstance(comment_list, list):
            continue
        for comment in comment_list:
            lines.append(comment.get('text', ''))
            # 'reply_list' is missing on comments without replies; the
            # original indexed it unconditionally and could KeyError.
            for reply in comment.get('reply_list') or []:
                lines.append(reply.get('text', ''))
    for line in lines:
        writeFile(line)
    return lines
def getStartMaxCursor(data):
    """Return the most recent paging 'cursor' found in the item list.

    Walks every item; whenever item['info'] carries a 'cursor' key the value
    is recorded, so the cursor from the last such item wins.  Returns 0 when
    no item provides one.
    """
    latest = 0
    for entry in data:
        if 'info' not in entry:
            continue
        info = entry['info']
        if 'cursor' not in info:
            continue
        latest = info['cursor']
    return latest
def getComments(max_cursor=0, series_id=1145):
    """Fetch all koubei (word-of-mouth) reviews for a car series, page by page.

    Starting from ``max_cursor``, repeatedly requests the paged list API,
    hands every page to getKeyWords (which appends the text to target.txt),
    and follows the returned cursor until the API reports no more data.

    Args:
        max_cursor: paging cursor to start from; 0 means "first page" and
                    also resets target.txt.
        series_id: dongchedi car-series id to fetch (default 1145, 轩逸).
    """
    if max_cursor == 0 and os.path.exists('target.txt'):
        os.remove('target.txt')
    cursor = max_cursor
    # Iterative pagination replaces the original recursion, which could hit
    # the recursion limit on a long review list.
    while True:
        url = ('https://m.dcdapp.com/motor/koubei_api/koubei_list_v2'
               '?aid=36&count=5&series_id=' + str(series_id) +
               '&max_cursor=' + str(cursor) + '&tag_id=0&sort_type=1')
        # Timeout so a stalled connection cannot hang the scrape forever.
        response = requests.get(url, timeout=10)
        payload = json.loads(response.text)
        data = payload.get('data') or {}
        paging = data.get('paging') or {}
        # BUG FIX: the original guarded on `'count' in paging` but then read
        # paging['has_more'], which could raise KeyError; read it directly.
        has_more = paging.get('has_more', 0)
        # Renamed from `list`, which shadowed the builtin.
        items = data.get('list') or []
        cursor = getStartMaxCursor(items)
        getKeyWords(items)
        if has_more == 0 or cursor == 0:
            break
        # Be polite to the API between page requests.
        time.sleep(3)
def generateWordCloudByFile():
    """Render a word cloud from the raw text of target.txt.

    Feeds the entire file contents to WordCloud.generate (which performs its
    own tokenisation), saves the image to wordcloud.png, and displays it
    with matplotlib.
    """
    # Read the whole corpus at once; encoding matches what writeFile wrote.
    # (The original concatenated line by line with `+=`, which is quadratic,
    # and closed the handle manually.)
    with open('target.txt', 'r', encoding='utf-8') as fh:
        all_text = fh.read()
    # SimHei supplies CJK glyphs; without a proper font the cloud renders
    # placeholder boxes.
    plt.rcParams["font.sans-serif"] = ["SimHei"]
    wc = WordCloud(
        background_color='white',
        width=1500,
        height=500,
        font_path='./simhei.ttf',  # font file must sit next to the script
        max_font_size=80,
        min_font_size=30,
        mode='RGBA'
    )
    wc.generate(all_text)
    # Remove any stale image before saving the new one.
    if os.path.exists('wordcloud.png'):
        os.remove('wordcloud.png')
    # Saving via to_file keeps the configured pixel size and is sharper than
    # the matplotlib window below.
    wc.to_file("wordcloud.png")
    # Display the rendered image without axes.
    plt.figure("口碑结果")
    plt.imshow(wc)
    plt.axis("off")
    plt.show()
def generateWordCloudByFenCi():
    """Render a word cloud from adjective frequencies found in target.txt.

    Segments every line with jieba's POS tagger, keeps only adjectives
    (flag 'a'), counts their frequencies, and feeds the counts to
    WordCloud.generate_from_frequencies.  The image is saved to
    wordcloud.png and displayed with matplotlib.
    """
    # BUG FIX: WordCloud's `stopwords=` argument is honoured only by
    # generate()/process_text(), NOT by generate_from_frequencies(), so the
    # original stop-word list had no effect — filter here instead.
    stopwords = {'的', '了', '我', '也', ','}
    keywords = {}
    with open('target.txt', 'r', encoding='utf-8') as fh:
        for line in fh:
            # POS-tag the line and keep only adjectives ('a').
            for token in psg.cut(line):
                if token.flag != 'a' or token.word in stopwords:
                    continue
                keywords[token.word] = keywords.get(token.word, 0) + 1
    plt.rcParams["font.sans-serif"] = ["SimHei"]
    print(keywords)
    wc = WordCloud(
        background_color='white',
        width=1500,
        height=500,
        font_path='./simhei.ttf',  # needed for CJK glyphs
        max_font_size=80,
        min_font_size=30,
        mode='RGBA'
    )
    # generate_from_frequencies is the right entry point when the
    # word -> frequency mapping is already known (no internal tokenisation).
    wc.generate_from_frequencies(keywords)
    # Remove any stale image before saving the new one.
    if os.path.exists('wordcloud.png'):
        os.remove('wordcloud.png')
    wc.to_file("wordcloud.png")  # saved at the configured pixel size
    plt.figure("口碑结果")
    plt.imshow(wc)
    plt.axis("off")
    plt.show()
# Script entry point: scrape all reviews, then render both word clouds.
# The guard keeps the HTTP scrape from firing if this module is imported.
if __name__ == "__main__":
    # Fetch every koubei comment over HTTP into target.txt.
    getComments()
    # Word cloud straight from the raw file text.
    generateWordCloudByFile()
    # Word cloud from jieba-segmented adjective frequencies.
    generateWordCloudByFenCi()