爬取京东手机评论区的用户评论图片

爬取京东评论图片

分析京东页面可知,数据是动态加载的,因此使用 selenium 滚动页面,使数据完整加载。

import json
import os
import time
import urllib
import urllib.parse
import urllib.request

import jsonpath
import lxml
import requests
from lxml import etree
from selenium import webdriver


def getProductIdsByKeyword(keyword):
    """Collect product SKU ids from one JD search-result page.

    Opens the JD search page for ``keyword`` in headless Chrome, scrolls to
    the bottom once so that dynamically loaded items get a chance to render,
    and reads the ``data-sku`` attribute of every ``li`` element whose class
    is ``gl-item``.

    Args:
        keyword: Search term. It is percent-encoded before being placed in
            the query string, so terms containing ``&``, spaces or non-ASCII
            characters do not corrupt the URL.

    Returns:
        The list of ``data-sku`` attribute values found on the page (empty
        when nothing matched).
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)
    try:
        encoded = urllib.parse.quote(keyword, safe='')
        # NOTE: page=5 and s=116 are hard-coded, so the same result page is
        # read on every call.
        url = ('https://search.jd.com/Search?keyword=' + encoded
               + '&suggest=1.his.0.0&wq=' + encoded
               + '&page=5&s=116&click=0')
        driver.get(url=url)
        time.sleep(1)
        # Scroll to the bottom, then wait for the remaining items to load.
        driver.execute_script('window.scrollTo(0,document.body.scrollHeight)')
        time.sleep(2)
        html_source = driver.page_source
    finally:
        # Always shut the browser down; otherwise every call leaves a
        # headless Chrome process running.
        driver.quit()

    html = lxml.etree.HTML(html_source)
    return html.xpath("//li[@class='gl-item']/@data-sku")


def getJdCommentsImage(startPage, endPage, productId, path):
    """Download the images attached to buyer comments of one JD product.

    For each page from ``startPage`` to ``endPage`` (inclusive) the
    image-comment list endpoint on club.jd.com is requested with a page size
    of 10, every ``imageUrl`` value in the JSON reply is collected, and each
    image is written to ``path + productId + <number> + '.jpg'`` where
    ``<number>`` is ``(page - 1) * 10 + position_on_page``.

    Paging stops early as soon as a page contains no ``imageUrl`` entries.
    A failure while fetching or saving a single image is printed and that
    image is skipped.

    Args:
        startPage: First page number to request.
        endPage: Last page number to request (inclusive).
        productId: Product SKU id as a string; used in the request URL, the
            Referer header and the output file names.
        path: Prefix for output files. It is concatenated directly with the
            file name, so it should end with a path separator, and the
            directory must already exist.

    Raises:
        Exception: Errors from the page request or from decoding its JSON
            body are not caught here and propagate to the caller.
    """
    # NOTE(review): 'authority', 'method', 'scheme' and 'path' are sent as
    # ordinary headers, and the 'Accept - Encoding' name contains spaces.
    # They are kept exactly as before because changing them could alter what
    # the server returns -- confirm before cleaning up.
    header = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        'authority': 'club.jd.com',
        'method': 'GET',
        'scheme': 'https',
        'Accept': '*/*', 'Accept - Encoding': 'gzip, deflate, br'
    }
    header['path'] = ('/discussion/getProductPageImageCommentList.action?productId='
                      + productId + '&page=1&pageSize=10')
    header['Referer'] = 'https://item.jd.com/' + productId + '.html'
    # Requests below use verify=False; silence the resulting TLS warnings.
    requests.packages.urllib3.disable_warnings()

    for num in range(startPage, endPage + 1):
        url = ('https://club.jd.com/discussion/getProductPageImageCommentList.action?productId='
               + productId + '&page=' + str(num) + '&pageSize=10')

        response = requests.post(url, headers=header, verify=False, timeout=10)
        jsonObjs = json.loads(response.text)
        image_urls = jsonpath.jsonpath(jsonObjs, '$..imageUrl')
        if not image_urls:
            # jsonpath returns False (not an empty list) when nothing
            # matches; treat that as "no more images" and stop paging.
            break

        for i, image_url in enumerate(image_urls, start=1):
            index = (num - 1) * 10 + i
            print('*' * 10 + '正在下载第' + str(index) + '张图片' + '*' * 10)
            try:
                # The JSON holds scheme-relative URLs, hence the prefix.
                with urllib.request.urlopen('https:' + image_url, timeout=5) as resp:
                    data = resp.read()
                # Name the file after the productId argument so the function
                # does not depend on variables of the calling script.
                with open(path + productId + str(index) + '.jpg', 'wb') as file:
                    file.write(data)
            except Exception as e:
                # Best effort: report the failed image and move on.
                print('第' + str(index) + '张图片下载出错,错误信息如下:')
                print(' ' * 10 + str(e))
                print('')

    print('*' * 15 + '下载完成' + '*' * 15)


# Example: getJdCommentsImage(1, 10, '<productId>', 'd:/download/')  # 10 images per page; args: start page, end page, product id, save path


if __name__ == '__main__':
    # Category keyword to search for; it also names the output sub-folder.
    search_keyword = '手机'
    productids = getProductIdsByKeyword(search_keyword)
    print(productids)

    # Make sure the download folder exists before any image is written.
    save_dir = 'E:/downloadPage3/' + search_keyword + '/'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # Fetch comment pages 1-50 for every product; an error on one product is
    # printed and the loop moves on to the next one.
    for j, sku in enumerate(productids):
        try:
            getJdCommentsImage(1, 50, sku, save_dir)
        except Exception as e:
            print(str(e))

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值