文章目录
京东评论爬取->入库 ->数据可视化
提示:以下是本篇文章正文内容,下面案例可供参考
# 导包
import pygal
import json
import sqlite3
import time
import requests
from wordcloud import WordCloud
import jieba
# Crawl one page of product comments from JD's comment API
def get_one_product_one_page_comments(pid, pageno=1, timeout=10):
    """
    Fetch one page of comments for a product.

    :param pid: product id
    :param pageno: comment page number (observed max ~99; beyond that the
        API returns only summary data with no detail records)
    :param timeout: request timeout in seconds (added so a dead connection
        cannot hang the crawl forever; default 10)
    :return: list of comment dicts, e.g. [{'content': ...}, ...]
    """
    base_url = 'https://club.jd.com/comment/productPageComments.action'
    # Only User-Agent needs to be faked for this endpoint at the moment;
    # a Cookie header was required in earlier tests -- re-enable the
    # commented fields below if requests start failing.
    headers = {
        # 'Cookie': '........',
        # 'Referer': 'https://item.jd.com/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36',
    }
    # Copied from the devtools "query params" panel and trimmed.
    # Omitting the 'callback' param makes the server return plain JSON
    # instead of JSONP, so no unwrapping/regex filtering is needed.
    params = {
        # 'callback': 'fetchJSON_comment98',
        'productId': pid,   # product id
        'score': 0,
        'sortType': 5,
        'page': pageno,     # page n (max ~99 in testing)
        'pageSize': 10,
        'isShadowSku': 0,
        'rid': 0,
        'fold': 1
    }
    resp = requests.get(base_url, headers=headers, params=params, timeout=timeout)
    # The response body is plain JSON (see note above), so resp.json()
    # parses it directly; the original decoded resp.text by hand and also
    # read resp.status_code into a variable it never used.
    comments_obj = resp.json()
    return comments_obj['comments']
# Write one crawled comment into the database
def write_comment_to_db(c, cursor):
    """
    Insert a single JD comment record into the jd_comments table.

    Does not commit; the caller owns the transaction (see the page loop
    in __main__).

    :param c: one comment dict from the JD comment API; must contain
        'id', 'content', 'creationTime' and 'productColor'
    :param cursor: an open sqlite3 cursor
    :raises KeyError: if a required key is missing from *c*
    """
    # The original also read c['productSize'] and c.get('images') into
    # locals it never used -- dropped; productSize in particular could
    # raise KeyError for a value that was thrown away.
    cursor.execute(
        """
        insert into jd_comments (cid, content, product_color, creation_time)
        values (?, ?, ?, ?);
        """,
        [c['id'], c['content'], c['productColor'], c['creationTime']],
    )
# Database initialisation
def db_init():
    """
    Open the SQLite database and ensure the jd_comments table exists.

    :return: (connection, cursor); the caller must close the connection.
    """
    conn = sqlite3.connect('../l05/textsqlite.db')
    cur = conn.cursor()
    cur.execute("""
    CREATE TABLE IF NOT EXISTS jd_comments (
        id INTEGER PRIMARY KEY,
        cid INTEGER,
        content TEXT,
        product_color TEXT,
        creation_time DATETIME
    );
    """)
    return conn, cur
# Pull comments from the database, filter stop words, render a word cloud
def get_comments_show_WordCloud():
    """
    Build a word-cloud image from the 200 most recent comments.

    Reads rows via the module-level `cursor`, segments the combined
    comment text with jieba, removes stop words, and writes the image
    to './评论词云图.png'.
    """
    cursor.execute("""select * from jd_comments order by creation_time desc limit 0,200;""")
    rows = cursor.fetchall()
    # content is the 3rd column of jd_comments (id, cid, content, ...)
    text = ' '.join(row[2] for row in rows)
    words = jieba.cut(text, cut_all=False)
    with open('./dict/stop_words_zh.txt', mode='r', encoding='utf-8') as f:
        # set() gives O(1) membership tests; the original scanned a list
        # once per segmented word
        stop_words = set(f.read().splitlines())
    filtered_words = [w for w in words if w not in stop_words]
    comment_word_str = ' '.join(filtered_words)
    wc = WordCloud(
        font_path='./FZSTK.TTF',  # Chinese-capable font (ships with Windows)
        background_color='white',
        width=1000,
        height=800,
        min_font_size=50,
    ).generate(comment_word_str)
    wc.to_file('./评论词云图.png')
# Estimate a satisfaction score from stored comments
def get_comments_show_score():
    """
    Compute and print a rough satisfaction score from the 100 most
    recent comments.

    Segments the comments with jieba, drops stop words, then counts
    positive and negative sentiment words and degree adverbs, combining
    them into a single score that is bucketed into low/medium/high
    satisfaction.

    NOTE(review): scores <= 0 print no verdict at all, each degree word
    contributes 0.1 (not 1), and the 600/1000 thresholds look
    hand-tuned -- confirm the intended scale before relying on the
    printed result. Behaviour is kept exactly as the original.
    """
    cursor.execute("""select * from jd_comments order by creation_time desc limit 0,100;""")
    rows = cursor.fetchall()  # [(id, cid, content, product_color, creation_time), ...]
    text = ''.join(row[2] for row in rows)  # one long string for jieba
    words = jieba.cut(text, cut_all=False)
    # All dictionaries are loaded into sets: O(1) membership instead of
    # the original's O(n) list scan per word.
    with open('./dict/stop_words_zh.txt', mode='r', encoding='utf-8') as f:
        stop_words = set(f.read().splitlines())
    filtered_words = [w for w in words if w not in stop_words]
    with open('./dict/emotion_dict/neg_all_dict.txt', mode='r', encoding='utf-8') as f:
        negative_words = set(f.read().splitlines())
    negative_words_num = sum(1 for w in filtered_words if w in negative_words)
    with open('./dict/emotion_dict/pos_all_dict.txt', mode='r', encoding='utf-8') as f:
        positive_words = set(f.read().splitlines())
    positive_words_num = sum(1 for w in filtered_words if w in positive_words)
    # Count degree adverbs; each occurrence adds 0.1, as in the original.
    from L06.dict.degree_dict.degree_dict import DEGREE_ALL
    degree_words_num = sum(0.1 for w in filtered_words if w in DEGREE_ALL)
    # Sentiment-weighted satisfaction estimate for the product.
    score = (positive_words_num * 1 - negative_words_num * 3) / 5
    score = score * (degree_words_num)
    print(score)
    if 0 < score < 600:
        print('用户对商品满意度低')
    elif 600 <= score < 1000:
        print('用户对商品满意度中')
    elif score >= 1000:
        print('用户对商品满意度高')
# Total number of users (one comment row == one user)
def get_user_sum():
    """Return the total number of rows in jd_comments."""
    cursor.execute("""select count(id) from jd_comments;""")
    rows = cursor.fetchall()
    # count(*) query yields a single one-column row: [(n,)]
    return rows[0][0]
# Colour distribution over all comments
def get_color_data():
    """Return [(product_color, count), ...] grouped by colour."""
    cursor.execute("""select product_color,count(id) from jd_comments group by product_color;""")
    return cursor.fetchall()
# Render the colour-share pie chart
def show_pie_chart(rs, user_sum, file_path):
    """
    Render a pie chart of purchase share per product colour.

    :param rs: [(color, count), ...] as returned by get_color_data()
    :param user_sum: total number of comments, used as the percentage base
    :param file_path: output .svg path
    """
    colors = {}
    for color, num in rs:
        colors[color] = round(num / user_sum * 100, 2)
    print('colors ', colors)
    pie_chart = pygal.Pie()
    pie_chart.title = 'iphone12购买颜色比例(%)'
    # Iterate whatever colours actually occur in the data instead of
    # hard-coding five names -- the original raised KeyError whenever
    # one of 白色/红色/绿色/蓝色/黑色 was missing from the result set.
    for color, pct in colors.items():
        pie_chart.add(color, pct)
    pie_chart.render_to_file(file_path)
# Daily comment counts
def get_user_amount():
    """Return [(YYYY-MM-DD, count), ...] grouped by comment date."""
    cursor.execute("""select substr(creation_time,1,10),count(id) from jd_comments group by substr(creation_time,1,10);""")
    return cursor.fetchall()
# Split (date, count) rows into parallel date and count lists
def get_data_list(comment_date_amount, start=10, stop=30):
    """
    Split a window of (date, count) rows into two parallel lists.

    :param comment_date_amount: [(date_str, count), ...]
    :param start: first index of the window (default 10, as originally
        hard-coded)
    :param stop: one past the last index of the window (default 30)
    :return: (dates, counts) for comment_date_amount[start:stop]
    """
    # One slice, one pass per list -- the original sliced and looped
    # over the same [10:30] window twice.
    window = comment_date_amount[start:stop]
    comment_date = [row[0] for row in window]
    comment_sum = [row[1] for row in window]
    return comment_date, comment_sum
# Line chart of comment counts per day
def shoe_line_chart(comment_date, comment_sum, file_path):
    """
    Render the daily comment counts as a pygal line chart.

    NOTE(review): the name is presumably a typo for `show_line_chart`;
    kept unchanged because __main__ calls it by this name.

    :param comment_date: list of date strings for the x axis
    :param comment_sum: list of comment counts, same length
    :param file_path: output .svg path
    """
    chart = pygal.Line(x_label_rotation=20)
    chart.x_labels = [str(d) for d in comment_date]
    chart.add('comments_count', comment_sum)
    chart.render_to_file(file_path)
if __name__ == '__main__':
    # Open the database; `cursor` is read as a module-level global by the
    # query functions above.
    connect, cursor = db_init()
    # Product id to crawl (an iPhone 12 listing)
    product_id = 100009077475
    for pageno in range(1, 30):
        one_page_comments = get_one_product_one_page_comments(product_id, pageno)
        for c in one_page_comments:
            write_comment_to_db(c, cursor)
        # Commit once per page, then pause to avoid hammering the API.
        connect.commit()
        print(f'第{pageno}页数据插入完成')
        time.sleep(1)
    # Word cloud
    get_comments_show_WordCloud()
    # Comment sentiment score
    get_comments_show_score()
    # Pie chart
    file_path1 = './show_pie_chart.svg'
    user_sum = get_user_sum()
    print('用户总数:',user_sum)
    rs = get_color_data()
    show_pie_chart(rs, user_sum, file_path1)
    # Line chart
    file_path2 = './shoe_line_chart.svg'
    comment_date_amount = get_user_amount()
    comment_date, comment_sum = get_data_list(comment_date_amount)
    shoe_line_chart(comment_date, comment_sum, file_path2)
    # Close the connection
    connect.close()
实验结果:
- 数据插入成功
- 词云图
- 用户购买颜色(饼状图)
- 折线图
以上就是这个实验的全部内容,以后会沿这个方向深入研究。