Downloading images with Scrapy
In Scrapy we can use the ImagesPipeline pipeline class for this. The class ships with Scrapy already, so we can use it directly.
Why use Scrapy's built-in file-download facility?
- It avoids re-downloading data that has already been downloaded.
- Downloads run asynchronously, so they are efficient.
You need to install Pillow, the image-processing library: pip install pillow
When using ImagesPipeline to download image data, we need to override three of its pipeline methods.
Required import: from scrapy.pipelines.images import ImagesPipeline
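In outline, the three overridden methods look roughly like this (a minimal sketch; the class name is a placeholder, and the full version appears in pipelines.py below):

from scrapy.pipelines.images import ImagesPipeline
import scrapy

class MyImagesPipeline(ImagesPipeline):   # hypothetical name, for illustration only
    def get_media_requests(self, item, info):
        # issue one download request per image URL stored in the item
        yield scrapy.Request(item['url'])

    def file_path(self, request, response=None, info=None, *, item=None):
        # return the file name (relative to IMAGES_STORE) for this request
        return "example.jpg"              # placeholder name

    def item_completed(self, results, item, info):
        # called once all image requests for this item have finished
        return item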
spider.py
import json
from urllib.parse import urlencode

import scrapy

from ..items import BdimgItem


class BaiduSpider(scrapy.Spider):
    name = 'baidu'
    allowed_domains = ['baidu.com']
    img_url = "https://image.baidu.com/search/acjson?"
    params = {
        "tn": "resultjson_com",
        "logid": "10085787444703054262",
        "ipn": "rj",
        "ct": "201326592",
        "is": "",
        "fp": "result",
        "fr": "",
        "word": "樱桃小丸子",
        "queryWord": "樱桃小丸子",
        "cl": "2",
        "lm": "-1",
        "ie": "utf-8",
        "oe": "utf-8",
        "adpicid": "",
        "st": "",
        "z": "",
        "ic": "",
        "hd": "",
        "latest": "",
        "copyright": "",
        "s": "",
        "se": "",
        "tab": "",
        "width": "",
        "height": "",
        "face": "",
        "istype": "",
        "qc": "",
        "nc": "1",
        "expermode": "",
        "nojc": "",
        "isAsync": "",
        "pn": "0",    # page offset (30 results per page)
        "rn": "30",
        "gsm": "0",   # gsm is the hexadecimal value of pn; convert with hex()
        "1661427776274": "",
    }
    # Concatenate the base URL and the encoded parameters
    start_urls = [img_url + urlencode(params)]

    def parse(self, response, **kwargs):
        data_list = json.loads(response.text)['data']
        # Walk through every entry in data
        for data in data_list:
            try:
                # Instantiate an item for each image
                item = BdimgItem()
                item['url'] = data['thumbURL']
                item['name'] = data['shituToken']
                yield item
            except KeyError:
                # Skip entries that lack a thumbnail URL or token
                continue
        # After the start URL has been requested, rebuild the URL to fetch pages 2 and 3
        for i in range(1, 3):
            next_url = 'https://image.baidu.com/search/acjson?tn=resultjson_com&logid=11086734970871598075&ipn=rj&ct=201326592&is=&fp=result&fr=&word=%E6%A8%B1%E6%A1%83%E5%B0%8F%E4%B8%B8%E5%AD%90&queryWord=%E6%A8%B1%E6%A1%83%E5%B0%8F%E4%B8%B8%E5%AD%90&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=&z=&ic=&hd=&latest=&copyright=&s=&se=&tab=&width=&height=&face=&istype=&qc=&nc=1&expermode=&nojc=&isAsync=&pn={}&rn=30&gsm={}&1661422302527='.format(i * 30, hex(i * 30))
            yield scrapy.Request(url=next_url, callback=self.parse)
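The long hard-coded next_url can also be rebuilt from the params dict already defined on the spider. A sketch of that alternative, to be used inside parse in place of the hard-coded string (it yields an equivalent query string; only the logid and the trailing timestamp parameter differ):

        # Alternative: rebuild the page URL from self.params instead of hard-coding it
        for i in range(1, 3):
            page_params = dict(self.params, pn=str(i * 30), gsm=hex(i * 30))
            yield scrapy.Request(url=self.img_url + urlencode(page_params), callback=self.parse)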
items.py
import scrapy


class BdimgItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()
    url = scrapy.Field()
pipelines.py
import scrapy
from itemadapter import ItemAdapter
from scrapy.pipelines.images import ImagesPipeline  # required module for downloading images with Scrapy
import requests  # only used by the traditional approach shown (commented out) below


class BdimgPipeline(ImagesPipeline):  # must inherit from ImagesPipeline
    # Issue a download request for each image URL
    def get_media_requests(self, item, info):
        # meta passes values on to the next method, e.g. meta = {'key1': value1}
        yield scrapy.Request(item['url'], meta={'name': item['name']})

    # Customize the image file name and extension
    def file_path(self, request, response=None, info=None, *, item=None):
        # Retrieve the value stored under 'name' in meta
        name = request.meta['name']
        # The URL carries no extension, so append one ourselves
        data = name + ".jpg"
        print('Saving', data)
        return data
        # IMAGES_STORE = "D:/应用/pycharm/爬虫/13_scrapy下载图片/作业/图片/"  -- add the storage path in settings.py, after the ITEM_PIPELINES entry

    # Return the item to the next pipeline stage
    def item_completed(self, results, item, info):
        return item


# Traditional approach using requests
# class BdimgPipeline:
#     def process_item(self, item, spider):
#         src = item['url']
#         # Derive an image name from the URL
#         name = src.split('=')[1][:10]
#         file_path = "D:/应用/pycharm/爬虫/13_scrapy下载图片/作业/图片"
#
#         res = requests.get(src).content
#
#         with open(file_path + "/" + name + ".jpg", "wb") as f1:
#             print("Saving", name)
#             f1.write(res)
#
#         return item
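item_completed receives results, a list of (success, file_info_or_failure) two-tuples, one per request issued by get_media_requests; on success the dict carries 'url', 'path' (relative to IMAGES_STORE) and 'checksum'. If you want failed downloads to drop the item instead of passing it through, a sketch (DropItem comes from scrapy.exceptions; the subclass name is hypothetical):

from scrapy.exceptions import DropItem

class StrictBdimgPipeline(BdimgPipeline):   # hypothetical subclass, for illustration
    def item_completed(self, results, item, info):
        # keep only the paths of successfully downloaded images
        image_paths = [file_info['path'] for ok, file_info in results if ok]
        if not image_paths:
            raise DropItem("no image downloaded for %s" % item['name'])
        return item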
settings.py
# Define the user agent (around line 17 of the generated settings.py)
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'
# Only print error-level logs (add this yourself)
LOG_LEVEL = "ERROR"
# Obey robots.txt rules
# Do not obey the robots.txt "gentlemen's agreement"
ROBOTSTXT_OBEY = False
# Enable the pipeline (around line 60)
ITEM_PIPELINES = {
    'bdimg.pipelines.BdimgPipeline': 300,
}
# Image storage path (add this yourself, after the ITEM_PIPELINES entry)
IMAGES_STORE = "D:/应用/pycharm/爬虫/13_scrapy下载图片/作业/图片/"
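Two optional ImagesPipeline settings are also worth knowing: IMAGES_EXPIRES controls how long a previously downloaded image counts as fresh (this is what makes the "avoid re-downloading" point above work), and IMAGES_THUMBS generates thumbnails alongside the originals. A sketch with assumed example values:

# Optional: treat images downloaded within the last 30 days as fresh (Scrapy's default is 90)
IMAGES_EXPIRES = 30
# Optional: also generate thumbnails next to the full-size images
IMAGES_THUMBS = {
    'small': (50, 50),
    'big': (270, 270),
}

With the spider, item, pipeline and settings in place, start the crawl from the project root with scrapy crawl baidu; the images are written under IMAGES_STORE using the file names returned by file_path.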