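"""Crawl the images attached to JD.com product reviews.

The JD pages load their data dynamically, so selenium is used to scroll the
page and let it finish loading all of its data.
"""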
import json
import time
import urllib
import jsonpath
import requests
import lxml
from lxml import etree
from selenium import webdriver
import os
def getProductIdsByKeyword(keyword):
    """Collect product SKU ids from one JD search-result page for a keyword.

    The search URL is opened in headless Chrome, the page is scrolled to the
    bottom once so lazily loaded items get a chance to render, and the
    ``data-sku`` attribute of every ``<li class="gl-item">`` element is read
    from the resulting page source.

    Args:
        keyword: Search term. It is percent-encoded before being placed in
            the URL, so it should be passed in plain (un-encoded) form.

    Returns:
        The list of ``data-sku`` attribute values found on the page
        (empty when no item matched).
    """
    # Local import: the file only does a bare `import urllib`, which does not
    # guarantee that the `urllib.parse` submodule is loaded.
    from urllib.parse import quote

    # Encode the keyword so characters such as '&', '#' or spaces cannot
    # break the query string.
    encoded = quote(keyword, safe='')
    # Earlier variants of this URL used the query strings
    # '&psort=4&psort=4&click=0' and '&psort=3&psort=3&page=3&s=61&click=0'.
    url = ('https://search.jd.com/Search?keyword=' + encoded
           + '&suggest=1.his.0.0&wq=' + encoded + '&page=5&s=116&click=0')

    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url=url)
        time.sleep(1)
        # Scroll to the bottom so the dynamically loaded items are requested.
        driver.execute_script('window.scrollTo(0,document.body.scrollHeight)')
        time.sleep(2)
        html_source = driver.page_source
    finally:
        # Always shut the browser down, even if loading the page failed;
        # otherwise every call leaves a headless Chrome process behind.
        driver.quit()

    html = lxml.etree.HTML(html_source)
    return html.xpath("//li[@class='gl-item']/@data-sku")
def getJdCommentsImage(startPage, endPage, productId, path):
    """Download the review images of one JD product.

    For each page from ``startPage`` to ``endPage`` (inclusive) the
    image-comment list endpoint on club.jd.com is requested, every
    ``imageUrl`` value in the JSON reply is collected, and each image is
    saved as ``path + productId + <running index> + '.jpg'``.

    Paging stops early at the first page whose reply contains no
    ``imageUrl`` entries. A failure while downloading or saving a single
    image is printed and skipped; errors from the page request itself or
    from decoding its JSON propagate to the caller.

    Args:
        startPage: First comment page to request. The running image index
            is computed as ``(page - 1) * 10 + position``.
        endPage: Last comment page to request, inclusive.
        productId: Product id as a string; used in the request URL, the
            Referer header and the saved file name.
        path: Prefix the file name is appended to by plain concatenation,
            so it needs to end with a path separator.
    """
    # Local import: a bare `import urllib` does not guarantee that the
    # `urllib.request` submodule is loaded.
    import urllib.request

    header = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        'authority': 'club.jd.com',
        'method': 'GET',
        'scheme': 'https',
        'Accept': '*/*', 'Accept - Encoding': 'gzip, deflate, br'
    }
    # NOTE(review): this 'path' header is built once and always names page 1,
    # regardless of which page is actually requested below.
    header['path'] = ('/discussion/getProductPageImageCommentList.action?productId='
                      + productId + '&page=' + str(1) + '&pageSize=10')
    header['Referer'] = 'https://item.jd.com/' + productId + '.html'

    # Requests are made with verify=False, so silence the resulting warnings.
    requests.packages.urllib3.disable_warnings()

    for num in range(startPage, endPage + 1):
        url = ('https://club.jd.com/discussion/getProductPageImageCommentList.action?productId='
               + productId + '&page=' + str(num) + '&pageSize=10')
        response = requests.post(url, headers=header, verify=False, timeout=10)
        jsonObjs = json.loads(response.text)

        # jsonpath.jsonpath() returns False (not an empty list) when nothing
        # matches, so guard before iterating and stop paging at that point.
        image_urls = jsonpath.jsonpath(jsonObjs, '$..imageUrl')
        if not image_urls:
            break

        for i, image_url in enumerate(image_urls, start=1):
            index = (num - 1) * 10 + i
            print('*' * 10 + '正在下载第' + str(index) + '张图片' + '*' * 10)
            try:
                # The URLs in the reply are prefixed with 'https:' here, i.e.
                # they are expected to be protocol-relative ('//host/...').
                with urllib.request.urlopen('https:' + image_url, timeout=5) as resp:
                    res = resp.read()
                # Name the file after the productId argument rather than
                # reaching for module-level state.
                with open(path + productId + str(index) + '.jpg', 'wb') as file:
                    file.write(res)
            except Exception as e:
                # Best effort: report the failed image and move on.
                print('第' + str(index) + '张图片下载出错,错误信息如下:')
                print(' ' * 10 + str(e))
                print('')

    print('*' * 15 + '下载完成' + '*' * 15)
# Example: getJdCommentsImage(1, 10, '<productId>', 'd:/download/')  # 10 images per page (start page, end page, product id, image save path)
if __name__ == '__main__':
    # Search keyword for the product category; it also names the output
    # sub-directory below.
    keywords = '手机'

    productids = getProductIdsByKeyword(keywords)
    print(productids)

    # Create the per-keyword download directory on first use.
    path = 'E:/downloadPage3/' + keywords + '/'
    if not os.path.exists(path):
        os.makedirs(path)

    # `productids` and the index `j` are deliberately kept as module-level
    # names; code outside this block may read them.
    for j, sku in enumerate(productids):
        try:
            # Pages 1 through 50 of the image comments for this product.
            getJdCommentsImage(1, 50, sku, path)
        except Exception as e:
            # A failure for one product is printed and the loop moves on.
            print(str(e))