# 电商评论分析 (e-commerce review analysis)
import json
import sqlite3
import requests
import jieba
import PIL.Image as image
import numpy as np
from wordcloud import WordCloud
# --- Crawl Tmall review pages and persist each review into SQLite. ---
# Open the database once, OUTSIDE the page loop (the original re-opened it
# every iteration); the analysis section below keeps using `connect`/`cursor`.
connect = sqlite3.connect('./taobaosqlite.db')
cursor = connect.cursor()
cursor.execute("""
CREATE TABLE IF NOT EXISTS comment(
    cid INTEGER PRIMARY KEY,
    rateContent TEXT,
    rateDate TEXT,
    auctionSku TEXT
);
""")

# Request headers are page-invariant — hoisted out of the loop.
# Cookie/referer/UA are runtime data and kept byte-identical.
headers = {
    'Cookie': 'miid=9096124061225870467; cna=XmfqG8gmnzUCAd8Lc7lyVLZ0; isg=BHZ2kvUvCWAXv_1GaZDgundzxKx4l7rRUCPnVuBdutifIxS9SSS74KuVO39PkLLp; l=fBjNRyQuTJqIOv0ABO5Brurza77tzQR4zkPzaNbMiIEGa1ROaO32eNCFc1z6udtfgT5DxeKrcdiaydFe7xa38x_ceTwHhx126dp68etzRyMc.; tfstk=civOBP4l420GcN11YOh33phfj4XOaerOol_YHLTjSZ7SW-yYFsVMELQxZfsVzNnd.; t=f0e5b93f2e60e43b6ca21b424321e70a; _tb_token_=7855531455e03; cookie2=151902a75fed1637eea6b8c26507082e; xlly_s=1; dnk=tb639092880469; uc1=existShop=false&cookie14=UoezTpdwRSpKEQ%3D%3D&cookie15=WqG3DMC9VAQiUQ%3D%3D&cookie21=UtASsssmfufd&pas=0&cookie16=VT5L2FSpNgq6fDudInPRgavC%2BQ%3D%3D; uc3=lg2=VT5L2FSpMGV7TQ%3D%3D&vt3=F8dCvjEaippJ%2FOZzQYo%3D&nk2=F5RDL9RrSUO5zO06UcA%3D&id2=UUpgRSiY3km8Hl%2FuLA%3D%3D; tracknick=tb639092880469; lid=tb639092880469; uc4=nk4=0%40FY4I7KuBNAwldwuEKIk8BhTXlkAk8%2FNSHw%3D%3D&id4=0%40U2gqykAhsVeN4uYDJNSR1g%2BhqTTtD%2FoQ; _l_g_=Ug%3D%3D; unb=2213073703629; lgc=tb639092880469; cookie1=UtQOOP7IAl9hr7OOv6P1HGpRsjNddhsgBstLi%2FzJgiA%3D; login=true;',
    'referer': 'https://detail.tmall.com/item_o.htm?id=681081143790&sku_properties=5919063:6536025;122216431:27772',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36',
}

# BUG FIX: the original URL contained the mojibake '¤tPage=' — the
# '&curren' HTML entity swallowed the '&current' prefix. Restored to
# '&currentPage=' so the page number appended below actually takes effect.
first = ('https://rate.tmall.com/list_detail_rate.htm?itemId=681081143790'
         '&spuId=2700517147&sellerId=883737303&order=3&currentPage=')
last = '&append=0&content=1&tagId=&posi=&picture=&groupId=&ua=098'

for i in range(1, 2):
    print("正在爬取第" + str(i) + "页")
    base_url = first + str(i) + last
    resp = requests.get(base_url, headers=headers)
    comments = resp.text
    print(comments)
    # The response is JSONP: callback( {...json...} ). Strip the wrapper
    # using the FIRST '(' and the LAST ')' so parentheses occurring inside
    # the JSON payload (e.g. in review text) cannot truncate the document,
    # unlike the original split("(")[1].split(")")[0].
    comments_json = comments[comments.find('(') + 1:comments.rfind(')')]
    with open('./电商评论.json', mode='w', encoding='utf-8') as fp:
        fp.write(comments_json)
    comments_obj = json.loads(comments_json)
    print(comments_obj)
    rateList = comments_obj['rateDetail']['rateList']
    # Iterate the list directly instead of range(len(...)).
    for rate in rateList:
        cid = rate['id']
        rateContent = rate['rateContent']
        rateDate = rate['rateDate']
        auctionSku = rate['auctionSku']
        print('-' * 100)
        print(cid, rateContent)
        # Parameterized insert; 'or ignore' de-duplicates on the cid
        # primary key across re-runs.
        cursor.execute(
            """insert or ignore into comment (cid, rateContent, rateDate, auctionSku) values (?,?,?,?);""",
            [cid, rateContent, rateDate, auctionSku])
    connect.commit()
# Dump every stored review row for a quick visual sanity check.
cursor.execute("""
select * from comment;
""")
print(cursor.fetchall())

# Take the nine most recent reviews and fold their text (column index 1,
# rateContent) into one string for segmentation below.
cursor.execute("""select * from comment order by rateDate desc limit 0,9;""")
comments = ''.join(row[1] for row in cursor.fetchall())
# Tokenize the concatenated review text (jieba precise mode) and remove
# stop words before building the word-cloud input string.
words = jieba.cut(comments, cut_all=False)
with open('../L05/dict/stop_words_zh.txt', mode='r', encoding='utf-8') as f:
    # set() instead of list: O(1) membership test per token rather than a
    # linear scan of the stop-word file for every word.
    stop_words = set(f.read().splitlines())
# Comprehension replaces the original materialize-then-append loop;
# jieba's generator is consumed lazily.
filtered_comment_word_list = [word for word in words if word not in stop_words]
comment_words_str = ' '.join(filtered_comment_word_list)
print(comment_words_str)
# The database is no longer needed past this point.
cursor.close()
connect.close()
# Render the filtered tokens as a word cloud constrained to the shape of
# the triangle mask image, then save the picture to disk.
mask_array = np.array(image.open('./三角形.jpg'))
cloud_builder = WordCloud(
    font_path='./问藏书房.ttf',
    background_color='black',
    mask=mask_array,
    width=1000,
    height=800,
    max_words=500,
    min_font_size=50,
)
wc = cloud_builder.generate(comment_words_str)
wc.to_file('./电商评论词云图.png')