爬虫最终版(评论爬取)


京东评论爬取 -> 入库 -> 数据可视化


提示:以下是本篇文章正文内容,下面案例可供参考

# 导包
import pygal
import json
import sqlite3
import time
import requests
from wordcloud import WordCloud
import jieba

# 爬取网页评论
def get_one_product_one_page_comments(pid, pageno=1, timeout=10):
    """
    Fetch one page of comments for one JD product.

    :param pid: product id, sent as the ``productId`` query parameter
    :param pageno: page number, sent as the ``page`` query parameter
    :param timeout: seconds to wait for the server before giving up, so a
        stalled connection cannot hang the crawl forever
    :return: the list stored under the ``comments`` key of the JSON response,
        e.g. ``[{'content': ''}, {}]``
    :raises requests.HTTPError: if the server answers with a 4xx/5xx status
    :raises json.JSONDecodeError: if the response body is not valid JSON
    :raises KeyError: if the JSON object has no ``comments`` key
    """
    base_url = 'https://club.jd.com/comment/productPageComments.action'

    # Per the original author's note: faking the User-Agent is enough for this
    # request, although a Cookie header (and possibly a Referer such as
    # https://item.jd.com/) was needed in tests some time ago.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36',
    }

    # Query parameters copied from the browser's developer tools. The
    # 'callback' parameter ('fetchJSON_comment98') is deliberately omitted:
    # the original author found that without it the endpoint returns plain
    # JSON instead of JSONP, so no unwrapping is needed.
    params = {
        'productId': pid,
        'score': 0,
        'sortType': 5,
        # Original author's test note: at most 99 pages carry detailed
        # comments; beyond that only summary data is returned.
        'page': pageno,
        'pageSize': 10,
        'isShadowSku': 0,
        'rid': 0,
        'fold': 1,
    }

    resp = requests.get(base_url, headers=headers, params=params, timeout=timeout)
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    resp.raise_for_status()

    comments_obj = json.loads(resp.text)
    return comments_obj['comments']

# 将爬取的评论写入数据库
def write_comment_to_db(c, cursor):
    """
    Insert one comment into the ``jd_comments`` table.

    Only the fields that are actually stored are read from the comment, so a
    comment lacking unrelated keys (for example ``productSize``) is still
    written. The caller is responsible for committing the transaction.

    :param c: comment dict; must contain ``id``, ``content``,
        ``creationTime`` and ``productColor``
    :param cursor: DB-API cursor on a database that has a ``jd_comments`` table
    :raises KeyError: if one of the four stored fields is missing from ``c``
    """
    row = [c['id'], c['content'], c['productColor'], c['creationTime']]

    # Parameterized insert: values are bound by the driver, never formatted
    # into the SQL text.
    cursor.execute("""
        insert into jd_comments (cid, content, product_color, creation_time)
        values (?, ?, ?, ?);
    """, row)

# 数据库初始化
def db_init(db_path='../l05/textsqlite.db'):
    """
    Open the SQLite database and make sure the ``jd_comments`` table exists.

    :param db_path: path of the SQLite database file; defaults to the path
        the script has always used. ``':memory:'`` gives a throwaway database.
    :return: ``(connection, cursor)`` for the opened database
    """
    connect = sqlite3.connect(db_path)
    cursor = connect.cursor()
    # IF NOT EXISTS keeps repeated runs from failing on an existing table.
    cursor.execute("""
            CREATE TABLE IF NOT EXISTS jd_comments (
            id INTEGER PRIMARY KEY,
            cid INTEGER,
            content TEXT,
            product_color TEXT,
            creation_time DATETIME
        );
        """)
    return connect, cursor

# 从数据库中调取评论,过滤,并生成词图
def get_comments_show_WordCloud():
    """
    Build a word-cloud image from the 200 most recent stored comments.

    Reads rows through the module-level ``cursor``, segments the comment text
    with jieba, drops every token listed in ``./dict/stop_words_zh.txt`` and
    writes the image to ``./评论词云图.png``.
    """
    cursor.execute("""select * from jd_comments order by creation_time desc limit 0,200;""")
    comments_rs = cursor.fetchall()

    # Column index 2 of a jd_comments row is the comment text.
    comments = ' '.join(row[2] for row in comments_rs)

    with open('./dict/stop_words_zh.txt', mode='r', encoding='utf-8') as f:
        # A set gives O(1) membership tests; a list would be scanned once per
        # token.
        stop_words = set(f.read().splitlines())

    filtered_comment_word_list = [
        word
        for word in jieba.cut(comments, cut_all=False)
        if word not in stop_words
    ]
    comment_word_str = ' '.join(filtered_comment_word_list)

    wc = WordCloud(
        # A font with Chinese glyphs is required; the original note says this
        # one ships with Windows.
        font_path='./FZSTK.TTF',
        background_color='white',
        width=1000,
        height=800,
        min_font_size=50,
    ).generate(comment_word_str)

    wc.to_file('./评论词云图.png')

# 获取评论得到评估
def _load_word_set(path):
    """Read a UTF-8 word list (one entry per line) into a set for fast lookups."""
    with open(path, mode='r', encoding='utf-8') as f:
        return set(f.read().splitlines())


def get_comments_show_score():
    """
    Print a rough satisfaction score for the 100 most recent stored comments.

    Reads rows through the module-level ``cursor``, segments the text with
    jieba, removes stop words, then counts positive, negative and degree
    words. The score is ``(positive - 3 * negative) / 5`` scaled by
    ``0.1 * number_of_degree_words``. The raw score is printed, followed by a
    low / medium / high satisfaction label.
    """
    cursor.execute("""select * from jd_comments order by creation_time desc limit 0,100;""")
    comments_rs = cursor.fetchall()

    # Column index 2 of a jd_comments row is the comment text. Join with a
    # space (as the word-cloud function does) so the last word of one comment
    # cannot fuse with the first word of the next during segmentation.
    comments = ' '.join(row[2] for row in comments_rs)

    stop_words = _load_word_set('./dict/stop_words_zh.txt')
    # Whitespace-only tokens come from the separators and carry no sentiment.
    filtered_comment_word_list = [
        word
        for word in jieba.cut(comments, cut_all=False)
        if word.strip() and word not in stop_words
    ]

    negative_words = _load_word_set('./dict/emotion_dict/neg_all_dict.txt')
    positive_words = _load_word_set('./dict/emotion_dict/pos_all_dict.txt')

    negative_words_num = sum(
        1 for word in filtered_comment_word_list if word in negative_words
    )
    positive_words_num = sum(
        1 for word in filtered_comment_word_list if word in positive_words
    )

    # Count degree (intensifier) words; each one contributes 0.1 to the
    # multiplier. Kept as an explicit running sum so the float result matches
    # the original accumulation exactly.
    from L06.dict.degree_dict.degree_dict import DEGREE_ALL
    degree_words_num = 0
    for word in filtered_comment_word_list:
        if word in DEGREE_ALL:
            degree_words_num += 0.1

    # NOTE(review): with zero degree words the multiplier is 0 and the score
    # collapses to 0 regardless of sentiment — confirm this is intended.
    score = (positive_words_num * 1 - negative_words_num * 3) / 5
    score = score * degree_words_num
    print(score)

    # Scores at or below zero previously matched no branch and printed no
    # label; they are the clearest case of low satisfaction.
    if score < 600:
        print('用户对商品满意度低')
    elif score < 1000:
        print('用户对商品满意度中')
    else:
        print('用户对商品满意度高')

# 得到用户总数
def get_user_sum():
    """
    Count the rows of ``jd_comments`` via the module-level ``cursor``.

    :return: number of stored comments (the script treats this as the number
        of users)
    """
    cursor.execute("""select count(id) from jd_comments;""")
    # An aggregate without GROUP BY always yields exactly one row.
    count_row = cursor.fetchone()
    return count_row[0]

# 得到颜色列表
def get_color_data():
    """
    Count stored comments per product colour via the module-level ``cursor``.

    :return: list of ``(product_color, count)`` rows, one per distinct colour
    """
    query = """select product_color,count(id)  from jd_comments group by product_color;"""
    return cursor.execute(query).fetchall()

# 得到饼状图
def show_pie_chart(rs, user_sum, file_path):
    """
    Render a pie chart of the share of comments per product colour.

    Every colour present in ``rs`` gets a slice. The five colours the chart
    originally hard-coded come first, in their original order, when present;
    any other colour follows. A colour missing from the data is skipped
    instead of raising ``KeyError``.

    :param rs: rows whose first item is the colour and second item its count
    :param user_sum: total number of comments, the denominator of each share
    :param file_path: where the rendered chart is written
    """
    # Share of each colour as a percentage rounded to two decimals.
    colors = {r[0]: round(r[1] / user_sum * 100, 2) for r in rs}
    print('colors ', colors)

    preferred_order = ('白色', '红色', '绿色', '蓝色', '黑色')
    ordered_colors = [color for color in preferred_order if color in colors]
    ordered_colors += [color for color in colors if color not in preferred_order]

    pie_chart = pygal.Pie()
    pie_chart.title = 'iphone12购买颜色比例(%)'
    for color in ordered_colors:
        # str() keeps the legend label valid even for a NULL colour (None).
        pie_chart.add(str(color), colors[color])
    pie_chart.render_to_file(file_path)

# 筛选出日期与评论数
def get_user_amount():
    """
    Count stored comments per day via the module-level ``cursor``.

    The day is the first 10 characters of ``creation_time``.

    :return: list of ``(day, count)`` rows, one per distinct day
    """
    query = """select substr(creation_time,1,10),count(id) from jd_comments group by substr(creation_time,1,10);"""
    return cursor.execute(query).fetchall()

# 得到日期和评论数的列表
def get_data_list(comment_date_amount, start=10, stop=30):
    """
    Split a window of ``(date, count)`` rows into two parallel lists.

    :param comment_date_amount: sequence of rows whose first item is the date
        and second item the comment count
    :param start: index of the first row to include (default 10, the window
        the script has always used)
    :param stop: index one past the last row to include (default 30);
        ``None`` means "to the end"
    :return: ``(dates, counts)``; both lists are empty when the window falls
        outside the data
    """
    # Slice once so both lists are guaranteed to cover the same rows.
    window = comment_date_amount[start:stop]
    comment_date = [row[0] for row in window]
    comment_sum = [row[1] for row in window]
    return comment_date, comment_sum

# 得到日期和评论数的折线图
def shoe_line_chart(comment_date, comment_sum, file_path):
    """
    Render a line chart of comment counts per date.

    :param comment_date: x-axis values; each is converted with ``str``
    :param comment_sum: y-axis values, plotted as the ``comments_count`` series
    :param file_path: where the rendered chart is written
    """
    x_labels = map(str, comment_date)

    # Labels are rotated by 20 degrees.
    chart = pygal.Line(x_label_rotation=20)
    chart.x_labels = x_labels
    chart.add('comments_count', comment_sum)
    chart.render_to_file(file_path)

if __name__ == '__main__':
    # Pipeline: crawl comments -> store in SQLite -> word cloud, sentiment
    # score, colour pie chart and per-day line chart.
    #
    # `connect` and `cursor` must stay module-level names: the query helpers
    # in this file read `cursor` as a global rather than taking it as an
    # argument.
    connect, cursor = db_init()

    try:
        # Product id whose comments are crawled.
        product_id = 100009077475
        for pageno in range(1, 30):
            one_page_comments = get_one_product_one_page_comments(product_id, pageno)
            for c in one_page_comments:
                write_comment_to_db(c, cursor)

            # Commit once per page so a later failure keeps the pages already
            # fetched.
            connect.commit()
            print(f'第{pageno}页数据插入完成')
            # Pause between requests to avoid hammering the server.
            time.sleep(1)

        # Word cloud
        get_comments_show_WordCloud()

        # Sentiment score of the comments
        get_comments_show_score()

        # Pie chart of purchased colours
        file_path1 = './show_pie_chart.svg'
        user_sum = get_user_sum()
        print('用户总数:', user_sum)
        rs = get_color_data()
        show_pie_chart(rs, user_sum, file_path1)

        # Line chart of comments per day
        file_path2 = './shoe_line_chart.svg'
        comment_date_amount = get_user_amount()
        comment_date, comment_sum = get_data_list(comment_date_amount)
        shoe_line_chart(comment_date, comment_sum, file_path2)
    finally:
        # Close the connection even when a request, a query or a rendering
        # step raises.
        connect.close()

实验结果:

  • 数据插入成功

在这里插入图片描述
在这里插入图片描述

  • 词云图

在这里插入图片描述

  • 用户购买颜色(饼状图)

在这里插入图片描述

  • 折线图

在这里插入图片描述

以上就是这个实验的全部内容,以后会沿这个方向深入研究


  • 2
    点赞
  • 11
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值