# 电商评论分析 (e-commerce review analysis)
import json
import sqlite3
import requests
import jieba
import PIL.Image as image
import numpy as np
from wordcloud import WordCloud
# --- Crawl Tmall review pages and persist each review into SQLite. ---
# Open the database once, OUTSIDE the page loop (the original re-opened it
# every iteration); the analysis section below keeps using `connect`/`cursor`.
connect = sqlite3.connect('./taobaosqlite.db')
cursor = connect.cursor()
cursor.execute("""
CREATE TABLE IF NOT EXISTS comment(
    cid INTEGER PRIMARY KEY,
    rateContent TEXT,
    rateDate TEXT,
    auctionSku TEXT
);
""")

# Request headers are page-invariant — hoisted out of the loop.
# Cookie/referer/UA are runtime data and kept byte-identical.
headers = {
    'Cookie': 'miid=9096124061225870467; cna=XmfqG8gmnzUCAd8Lc7lyVLZ0; isg=BHZ2kvUvCWAXv_1GaZDgundzxKx4l7rRUCPnVuBdutifIxS9SSS74KuVO39PkLLp; l=fBjNRyQuTJqIOv0ABO5Brurza77tzQR4zkPzaNbMiIEGa1ROaO32eNCFc1z6udtfgT5DxeKrcdiaydFe7xa38x_ceTwHhx126dp68etzRyMc.; tfstk=civOBP4l420GcN11YOh33phfj4XOaerOol_YHLTjSZ7SW-yYFsVMELQxZfsVzNnd.; t=f0e5b93f2e60e43b6ca21b424321e70a; _tb_token_=7855531455e03; cookie2=151902a75fed1637eea6b8c26507082e; xlly_s=1; dnk=tb639092880469; uc1=existShop=false&cookie14=UoezTpdwRSpKEQ%3D%3D&cookie15=WqG3DMC9VAQiUQ%3D%3D&cookie21=UtASsssmfufd&pas=0&cookie16=VT5L2FSpNgq6fDudInPRgavC%2BQ%3D%3D; uc3=lg2=VT5L2FSpMGV7TQ%3D%3D&vt3=F8dCvjEaippJ%2FOZzQYo%3D&nk2=F5RDL9RrSUO5zO06UcA%3D&id2=UUpgRSiY3km8Hl%2FuLA%3D%3D; tracknick=tb639092880469; lid=tb639092880469; uc4=nk4=0%40FY4I7KuBNAwldwuEKIk8BhTXlkAk8%2FNSHw%3D%3D&id4=0%40U2gqykAhsVeN4uYDJNSR1g%2BhqTTtD%2FoQ; _l_g_=Ug%3D%3D; unb=2213073703629; lgc=tb639092880469; cookie1=UtQOOP7IAl9hr7OOv6P1HGpRsjNddhsgBstLi%2FzJgiA%3D; login=true;',
    'referer': 'https://detail.tmall.com/item_o.htm?id=681081143790&sku_properties=5919063:6536025;122216431:27772',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36',
}

# BUG FIX: the original URL contained the mojibake '¤tPage=' — the
# '&curren' HTML entity swallowed the '&current' prefix. Restored to
# '&currentPage=' so the page number appended below actually takes effect.
first = ('https://rate.tmall.com/list_detail_rate.htm?itemId=681081143790'
         '&spuId=2700517147&sellerId=883737303&order=3&currentPage=')
last = '&append=0&content=1&tagId=&posi=&picture=&groupId=&ua=098'

for i in range(1, 2):
    print("正在爬取第" + str(i) + "页")
    base_url = first + str(i) + last
    resp = requests.get(base_url, headers=headers)
    comments = resp.text
    print(comments)
    # The response is JSONP: callback( {...json...} ). Strip the wrapper
    # using the FIRST '(' and the LAST ')' so parentheses occurring inside
    # the JSON payload (e.g. in review text) cannot truncate the document,
    # unlike the original split("(")[1].split(")")[0].
    comments_json = comments[comments.find('(') + 1:comments.rfind(')')]
    with open('./电商评论.json', mode='w', encoding='utf-8') as fp:
        fp.write(comments_json)
    comments_obj = json.loads(comments_json)
    print(comments_obj)
    rateList = comments_obj['rateDetail']['rateList']
    # Iterate the list directly instead of range(len(...)).
    for rate in rateList:
        cid = rate['id']
        rateContent = rate['rateContent']
        rateDate = rate['rateDate']
        auctionSku = rate['auctionSku']
        print('-' * 100)
        print(cid, rateContent)
        # Parameterized insert; 'or ignore' de-duplicates on the cid
        # primary key across re-runs.
        cursor.execute(
            """insert or ignore into comment (cid, rateContent, rateDate, auctionSku) values (?,?,?,?);""",
            [cid, rateContent, rateDate, auctionSku])
    connect.commit()
# Dump every stored review row for a quick visual sanity check.
cursor.execute("""
select * from comment;
""")
print(cursor.fetchall())

# Take the nine most recent reviews and fold their text (column index 1,
# rateContent) into one string for segmentation below.
cursor.execute("""select * from comment order by rateDate desc limit 0,9;""")
comments = ''.join(row[1] for row in cursor.fetchall())
# Tokenize the concatenated review text (jieba precise mode) and remove
# stop words before building the word-cloud input string.
words = jieba.cut(comments, cut_all=False)
with open('../L05/dict/stop_words_zh.txt', mode='r', encoding='utf-8') as f:
    # set() instead of list: O(1) membership test per token rather than a
    # linear scan of the stop-word file for every word.
    stop_words = set(f.read().splitlines())
# Comprehension replaces the original materialize-then-append loop;
# jieba's generator is consumed lazily.
filtered_comment_word_list = [word for word in words if word not in stop_words]
comment_words_str = ' '.join(filtered_comment_word_list)
print(comment_words_str)
# The database is no longer needed past this point.
cursor.close()
connect.close()
# Render the filtered tokens as a word cloud constrained to the shape of
# the triangle mask image, then save the picture to disk.
mask_array = np.array(image.open('./三角形.jpg'))
cloud_builder = WordCloud(
    font_path='./问藏书房.ttf',
    background_color='black',
    mask=mask_array,
    width=1000,
    height=800,
    max_words=500,
    min_font_size=50,
)
wc = cloud_builder.generate(comment_words_str)
wc.to_file('./电商评论词云图.png')