I recently started working on web scraping. One business requirement was to download JD and Tmall product images according to the entries in an Excel file, including both the main (header) images and the detail images.
For JD, downloading the main images is easy, but the detail images are not: they do not appear in the HTML source of the product page, so you have to open the browser devtools Network panel and inspect the ajax requests to find the actual image URLs.
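As a rough illustration, here is a minimal sketch of that request flow (the product URL below is made up for illustration; the cd.jd.com description endpoint is the one found in the Network panel, and the full script later extracts it from the page source with a regex):

import re
import requests

product_url = 'https://item.jd.com/100000000000.html'  # hypothetical JD product page
headers = {'User-Agent': 'Mozilla/5.0'}
html = requests.get(product_url, headers=headers).text
# The detail-image content is served by cd.jd.com, not embedded in the product page itself
matches = re.findall(r'cd.jd.com/description/channel\?skuId=[\d]*&mainSkuId=[\d]*&cdn=2', html, re.S)
if matches:
    desc_html = requests.get('https://' + matches[0], headers=headers).text
    # desc_html contains the data-lazyload / background-image URLs of the detail images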
For Tmall, grabbing the main images is also easy, but the detail images are harder because the desktop site only shows them after you log in, and I could not work out how to log in with the requests library. Then I discovered that Tmall's mobile site has no such anti-scraping measure: the detail images are visible there without logging in. Wonderful, the world instantly became a happier place.
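A minimal sketch of the trick (the product id below is made up for illustration):

url = 'https://detail.tmall.com/item.htm?id=600000000000'  # hypothetical desktop URL
mobile_url = url.replace('detail.tmall.com', 'detail.m.tmall.com')
# The mobile page can be fetched without logging in; its detail images sit in data-ks-lazyload attributes.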
The Excel file provided by the business looks like this (one product per row, with the product title in the first column and the product URL in the second):
My Python code is as follows:
# -*- coding:utf8 -*-
import logging
import os
import re

import pandas as pd
import requests
from lxml import etree

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class DownLoadPic(object):
def __init__(self, excel_path):
self.excel_path = excel_path
def get_excel_data(self):
"""
        Get the data from the Excel file.
:return:
"""
df = pd.read_excel(self.excel_path)
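        # Each row is expected to hold the product title in column 0 and the product URL in column 1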
return df.values
@staticmethod
def switch_tm_url(url):
"""
        Convert a Tmall URL to its mobile version, which has fewer anti-scraping restrictions than the desktop site.
:return:
"""
url = url.replace('detail.tmall.com', 'detail.m.tmall.com')
return url
@staticmethod
def session_():
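        """
        Build a requests Session with browser-like headers for fetching pages and images.
        """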
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/77.0.3865.90 Safari/537.36',
'Accept-Encoding': 'gzip, deflate, br',
'Accept': '*/*',
'Connection': 'keep-alive'}
s = requests.Session()
s.headers.update(headers)
return s
def get_tm_header_image_url_list(self, url):
"""
        Get the URLs of the Tmall main (header) images.
:return:
"""
s = self.session_()
html = s.get(url).text
html_obj = etree.HTML(html)
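        # The main-image URLs on the mobile page are lazy-loaded, so they live in the data-src attribute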
header_image_url_list = html_obj.xpath(
'/html/body/div[1]/div[2]/div/div[2]/div[1]/div/section/div/a/img/@data-src')
del html
del html_obj
return header_image_url_list
def get_tm_detail_image_url_list(self, url):
"""
        Get the URLs of the Tmall detail images.
:return:
"""
s = self.session_()
html = s.get(url).text
html_obj = etree.HTML(html)
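        # Detail images on the mobile page are lazy-loaded via the data-ks-lazyload attribute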
detail_image_url_list = html_obj.xpath('//*[@id="modules-desc"]/div/div/div/div/img/@data-ks-lazyload')
del html
del html_obj
return detail_image_url_list
def get_jd_header_image_url_list(self, url):
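        """
        Get the URLs of the JD main (header) images from the product page's thumbnail list.
        :return:
        """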
headers = {
"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'
}
        r = requests.get(url=url, headers=headers)
html = r.text
html_obj = etree.HTML(html)
result = []
        for img in html_obj.xpath("//ul/li/img/@src"):
            # The thumbnail URLs in the page source are protocol-relative, so add the scheme when it is missing
            if not img.startswith('https:'):
                img_url = "https:" + img
            else:
                img_url = img
            # Strip the thumbnail size markers and request the 1080x1080 rendition instead
            header_img_url = img_url.replace("s54x54_", "").replace("s75x75_", "").replace("n5/", "cv/s1080x1080_")
            result.append(header_img_url)
# print(header_img_url)
del html
del html_obj
return result
def get_jd_detail_image_url_list(self, url):
"""
        Reference: https://www.jianshu.com/p/9de3be54abc1
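        The detail-image URLs come from the cd.jd.com description endpoint, i.e. the ajax request found via the devtools Network panel.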
"""
headers = {
"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'
}
result = []
        # Part 1: detail images referenced via data-lazyload (src) attributes
r = requests.get(url=url, headers=headers)
html = r.text
        js_url = re.findall(r'cd.jd.com/description/channel\?skuId=[\d]*&mainSkuId=[\d]*&cdn=2', html, re.S)[0]
js_url = "https://" + js_url
try:
r = requests.get(url=js_url, headers=headers)
            # Raise an exception if the response status code is not 200
            r.raise_for_status()
            # Close the connection -- very important!
            r.close()
        except Exception as e:
            logger.error(e)
        # The else branch only runs when no exception was raised
else:
html = r.text
# print(html)
# print(r.status_code)
            imgs = re.findall(r'data-lazyload=[^\s]*([a-zA-Z]*:*//[^\s\\]*)', html, re.S)
            # print("src imgs:", imgs)
            for img in imgs:
                # The extracted URLs are protocol-relative, so add the scheme when it is missing
                if not img.startswith('https:'):
                    img_url = "https:" + img
                else:
                    img_url = img
                # Request the large 9080x9080 rendition
                img_url = img_url.replace('jfs', 's9080x9080_jfs')
# print(img_url)
if not (img_url in result):
result.append(img_url)
        # Part 2: detail images referenced via CSS background-image
r = requests.get(url=url, headers=headers)
html = r.text
        js_url = re.findall(r'cd.jd.com/description/channel\?skuId=[\d]*&mainSkuId=[\d]*&cdn=2', html, re.S)[0]
        js_url = "https://" + js_url
        # print('js_url:', js_url)
try:
r = requests.get(url=js_url, headers=headers)
            # Raise an exception if the response status code is not 200
            r.raise_for_status()
            # Close the connection -- very important!
            r.close()
        except Exception as e:
            logger.error(e)
        # The else branch only runs when no exception was raised
else:
html = r.text
# print(html)
# print(r.status_code)
            imgs = re.findall(r'background-image:url\(([a-zA-Z]*:*//[^\s);]*)', html, re.S)
            # print("background imgs:", imgs)
            for img in imgs:
                # Same scheme and size handling as above
                if not img.startswith('https:'):
                    img_url = "https:" + img
                else:
                    img_url = img
                img_url = img_url.replace('jfs', 's9080x9080_jfs')
# print(img_url)
if not (img_url in result):
result.append(img_url)
# print('result:', result)
return result
def save_image(self, index, image_title, image_url, method):
"""
        Save an image under the local data/images directory.
        :param index: sequence number used in the file name
        :param method: whether this is a main image or a detail image; valid values are header_image and detail_image
        :param image_title: title of the image (the product title)
        :param image_url: URL of the image
:return:
"""
        # Get the image file extension from the URL
        file_suffix = os.path.splitext(image_url)[1]
        cwd = os.getcwd()
        image_path_filename = ''
        save_path = os.path.join(cwd, 'data', 'images', image_title)
if not os.path.exists(save_path):
os.makedirs(save_path)
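        # Main images are saved as <title><index><ext>, detail images as xq_<index><ext>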
if method == 'header_image':
image_path_filename = os.path.join(save_path, image_title + index + file_suffix)
elif method == 'detail_image':
image_path_filename = os.path.join(save_path, 'xq_' + index + file_suffix)
        # Download the binary content of the image
        s = self.session_()
        image = s.get(image_url)
        # Write the image to disk
with open(image_path_filename, 'wb') as f:
f.write(image.content)
def save_tm_image(self, goods_title, url):
"""
        Download the Tmall main images and detail images.
:param goods_title:
:param url:
:return:
"""
        # 1. For a Tmall URL, switch to the mobile version so the detail images can be downloaded
        url = self.switch_tm_url(url)
        # 2. Download the Tmall main images
tm_header_image_url_list = self.get_tm_header_image_url_list(url)
for index, img_url in enumerate(tm_header_image_url_list, start=1):
if not ('https:' in img_url):
img_url = 'https:' + img_url
index = str(index)
self.save_image(index=index,
image_title=goods_title,
image_url=img_url,
method='header_image')
        print('Tmall main images downloaded:', goods_title)
        # 3. Download the Tmall detail images
tm_detail_url_list = self.get_tm_detail_image_url_list(url)
for index, img_url in enumerate(tm_detail_url_list, start=1):
if not ('https:' in img_url):
img_url = 'https:' + img_url
index = str(index)
self.save_image(index=index,
image_title=goods_title,
image_url=img_url,
method='detail_image')
        print('Tmall detail images downloaded:', goods_title)
def save_jd_image(self, goods_title, url):
"""
        Save the JD main images and detail images.
:param goods_title:
:param url:
:return:
"""
        # 1. Download the JD main images
jd_header_image_url_list = self.get_jd_header_image_url_list(url)
for index, img_url in enumerate(jd_header_image_url_list, start=1):
if not ('https:' in img_url):
img_url = 'https:' + img_url
index = str(index)
self.save_image(index=index,
image_title=goods_title,
image_url=img_url,
method='header_image')
        print('JD main images downloaded:', goods_title)
        # 2. Download the JD detail images
jd_detail_url_list = self.get_jd_detail_image_url_list(url)
for index, img_url in enumerate(jd_detail_url_list, start=1):
# if not ('https:' in img_url):
# img_url = 'https:' + img_url
index = str(index)
self.save_image(index=index,
image_title=goods_title,
image_url=img_url,
method='detail_image')
        print('JD detail images downloaded:', goods_title)
def down_all_images(self):
"""
        Download the Tmall and JD images, including main images and detail images.
:return:
"""
excel_data_list = self.get_excel_data()
for index, row_data in enumerate(excel_data_list, start=1):
goods_title = row_data[0].strip()
url = row_data[1]
            # Download the Tmall images
if "detail.tmall.com" in url:
self.save_tm_image(goods_title, url)
elif "item.jd.com" in url:
self.save_jd_image(goods_title, url)
def main(self):
self.down_all_images()
if __name__ == "__main__":
    print('Starting the crawl, please wait...')
down_pic = DownLoadPic(excel_path=r'D:\RPA\down_pic\data\样例.xlsx')
down_pic.main()
    print('All done!!!')
Run result: